{ "training_args": { "output_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters/nlu_rte_ff_v1", "overwrite_output_dir": false, "do_train": false, "do_eval": true, "do_predict": false, "eval_strategy": "steps", "prediction_loss_only": false, "per_device_train_batch_size": 4, "per_device_eval_batch_size": 8, "per_gpu_train_batch_size": null, "per_gpu_eval_batch_size": null, "gradient_accumulation_steps": 4, "eval_accumulation_steps": null, "eval_delay": 0, "torch_empty_cache_steps": null, "learning_rate": 2e-05, "weight_decay": 0.0, "adam_beta1": 0.9, "adam_beta2": 0.999, "adam_epsilon": 1e-08, "max_grad_norm": 1.0, "num_train_epochs": 3, "max_steps": -1, "lr_scheduler_type": "linear", "lr_scheduler_kwargs": {}, "warmup_ratio": 0.0, "warmup_steps": 0, "log_level": "passive", "log_level_replica": "warning", "log_on_each_node": true, "logging_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters/nlu_rte_ff_v1/runs/Sep15_03-15-42_gx08", "logging_strategy": "steps", "logging_first_step": false, "logging_steps": 20, "logging_nan_inf_filter": true, "save_strategy": "epoch", "save_steps": 500, "save_total_limit": null, "save_safetensors": true, "save_on_each_node": false, "save_only_model": false, "restore_callback_states_from_checkpoint": false, "no_cuda": false, "use_cpu": false, "use_mps_device": false, "seed": 42, "data_seed": null, "jit_mode_eval": false, "use_ipex": false, "bf16": false, "fp16": false, "fp16_opt_level": "O1", "half_precision_backend": "auto", "bf16_full_eval": false, "fp16_full_eval": false, "tf32": null, "local_rank": 0, "ddp_backend": null, "tpu_num_cores": null, "tpu_metrics_debug": false, "debug": [], "dataloader_drop_last": false, "eval_steps": 39, "dataloader_num_workers": 0, "dataloader_prefetch_factor": null, "past_index": -1, "run_name": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters/nlu_rte_ff_v1", "disable_tqdm": false, "remove_unused_columns": true, "label_names": null, "load_best_model_at_end": false, "metric_for_best_model": null, "greater_is_better": null, "ignore_data_skip": false, "fsdp": [], "fsdp_min_num_params": 0, "fsdp_config": { "min_num_params": 0, "xla": false, "xla_fsdp_v2": false, "xla_fsdp_grad_ckpt": false }, "fsdp_transformer_layer_cls_to_wrap": null, "accelerator_config": { "split_batches": false, "dispatch_batches": null, "even_batches": true, "use_seedable_sampler": true, "non_blocking": false, "gradient_accumulation_kwargs": null }, "deepspeed": null, "label_smoothing_factor": 0.0, "optim": "adamw_torch", "optim_args": null, "adafactor": false, "group_by_length": false, "length_column_name": "length", "report_to": [], "ddp_find_unused_parameters": null, "ddp_bucket_cap_mb": null, "ddp_broadcast_buffers": null, "dataloader_pin_memory": true, "dataloader_persistent_workers": false, "skip_memory_metrics": true, "use_legacy_prediction_loop": false, "push_to_hub": false, "resume_from_checkpoint": null, "hub_model_id": null, "hub_strategy": "every_save", "hub_token": "", "hub_private_repo": null, "hub_always_push": false, "gradient_checkpointing": false, "gradient_checkpointing_kwargs": null, "include_inputs_for_metrics": false, "include_for_metrics": [], "eval_do_concat_batches": true, "fp16_backend": "auto", "push_to_hub_model_id": null, "push_to_hub_organization": null, "push_to_hub_token": "", "mp_parameters": "", "auto_find_batch_size": false, "full_determinism": false, "torchdynamo": null, "ray_scope": "last", "ddp_timeout": 1800, "torch_compile": false, "torch_compile_backend": null, "torch_compile_mode": null, "include_tokens_per_second": false, "include_num_input_tokens_seen": false, "neftune_noise_alpha": null, "optim_target_modules": null, "batch_eval_metrics": false, "eval_on_start": false, "use_liger_kernel": false, "eval_use_gather_object": false, "average_tokens_across_devices": false }, "lora_config": null, "flops": { "eval": 5333250945655808, "train": 9219668431260864, "total": 14552919376916672 }, "total_energy": 0.0, "logs": [ { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:15:48.006519", "step": 0, "epoch": 0 }, { "type": "pplx", "content": 68335452.534585, "timestamp": "2025-09-15 03:15:48.008878", "step": 0, "epoch": 0 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:48.089573", "step": 0, "epoch": 1 }, { "type": "loss", "content": 0.3317909240722656, "timestamp": "2025-09-15 03:15:48.091491", "step": 1, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:48.138774", "step": 1, "epoch": 1 }, { "type": "loss", "content": 0.3635498881340027, "timestamp": "2025-09-15 03:15:48.140818", "step": 2, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:48.189865", "step": 2, "epoch": 1 }, { "type": "loss", "content": 0.32277998328208923, "timestamp": "2025-09-15 03:15:48.192485", "step": 3, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:48.223345", "step": 3, "epoch": 1 }, { "type": "loss", "content": 0.38533610105514526, "timestamp": "2025-09-15 03:15:48.305210", "step": 4, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:15:48.351018", "step": 4, "epoch": 1 }, { "type": "loss", "content": 0.10644926130771637, "timestamp": "2025-09-15 03:15:48.353187", "step": 5, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:48.390429", "step": 5, "epoch": 1 }, { "type": "loss", "content": 0.13102366030216217, "timestamp": "2025-09-15 03:15:48.392650", "step": 6, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:15:48.437578", "step": 6, "epoch": 1 }, { "type": "loss", "content": 0.03480641916394234, "timestamp": "2025-09-15 03:15:48.439872", "step": 7, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:48.470535", "step": 7, "epoch": 1 }, { "type": "loss", "content": 0.005003761500120163, "timestamp": "2025-09-15 03:15:48.496060", "step": 8, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:15:48.525945", "step": 8, "epoch": 1 }, { "type": "loss", "content": 0.01741660200059414, "timestamp": "2025-09-15 03:15:48.528264", "step": 9, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:48.558378", "step": 9, "epoch": 1 }, { "type": "loss", "content": 0.04518657922744751, "timestamp": "2025-09-15 03:15:48.560476", "step": 10, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:15:48.590824", "step": 10, "epoch": 1 }, { "type": "loss", "content": 0.027425434440374374, "timestamp": "2025-09-15 03:15:48.598569", "step": 11, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:48.628755", "step": 11, "epoch": 1 }, { "type": "loss", "content": 0.04769379645586014, "timestamp": "2025-09-15 03:15:48.653997", "step": 12, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:48.684250", "step": 12, "epoch": 1 }, { "type": "loss", "content": 0.05360250547528267, "timestamp": "2025-09-15 03:15:48.686236", "step": 13, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:48.722020", "step": 13, "epoch": 1 }, { "type": "loss", "content": 0.00701676681637764, "timestamp": "2025-09-15 03:15:48.724093", "step": 14, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:48.754567", "step": 14, "epoch": 1 }, { "type": "loss", "content": 0.03688133880496025, "timestamp": "2025-09-15 03:15:48.758675", "step": 15, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:48.789361", "step": 15, "epoch": 1 }, { "type": "loss", "content": 0.02347908914089203, "timestamp": "2025-09-15 03:15:48.813142", "step": 16, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:48.850146", "step": 16, "epoch": 1 }, { "type": "loss", "content": 0.02019873820245266, "timestamp": "2025-09-15 03:15:48.852570", "step": 17, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:15:48.912330", "step": 17, "epoch": 1 }, { "type": "loss", "content": 0.044917989522218704, "timestamp": "2025-09-15 03:15:48.916388", "step": 18, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:15:48.946767", "step": 18, "epoch": 1 }, { "type": "loss", "content": 0.041808683425188065, "timestamp": "2025-09-15 03:15:48.949124", "step": 19, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:15:48.979593", "step": 19, "epoch": 1 }, { "type": "loss", "content": 0.04711863398551941, "timestamp": "2025-09-15 03:15:49.007320", "step": 20, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:15:49.037809", "step": 20, "epoch": 1 }, { "type": "loss", "content": 0.025185493752360344, "timestamp": "2025-09-15 03:15:49.042527", "step": 21, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:49.072760", "step": 21, "epoch": 1 }, { "type": "loss", "content": 0.02557338774204254, "timestamp": "2025-09-15 03:15:49.076364", "step": 22, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:49.107336", "step": 22, "epoch": 1 }, { "type": "loss", "content": 0.027826517820358276, "timestamp": "2025-09-15 03:15:49.109417", "step": 23, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:49.139561", "step": 23, "epoch": 1 }, { "type": "loss", "content": 0.026328999549150467, "timestamp": "2025-09-15 03:15:49.165029", "step": 24, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:15:49.195700", "step": 24, "epoch": 1 }, { "type": "loss", "content": 0.026946408674120903, "timestamp": "2025-09-15 03:15:49.200291", "step": 25, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:15:49.236144", "step": 25, "epoch": 1 }, { "type": "loss", "content": 0.03108673170208931, "timestamp": "2025-09-15 03:15:49.240802", "step": 26, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:49.271162", "step": 26, "epoch": 1 }, { "type": "loss", "content": 0.0360201820731163, "timestamp": "2025-09-15 03:15:49.275116", "step": 27, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:15:49.319835", "step": 27, "epoch": 1 }, { "type": "loss", "content": 0.033217381685972214, "timestamp": "2025-09-15 03:15:49.343514", "step": 28, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:15:49.384305", "step": 28, "epoch": 1 }, { "type": "loss", "content": 0.026532070711255074, "timestamp": "2025-09-15 03:15:49.386407", "step": 29, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:49.417594", "step": 29, "epoch": 1 }, { "type": "loss", "content": 0.02795436605811119, "timestamp": "2025-09-15 03:15:49.419573", "step": 30, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:49.449585", "step": 30, "epoch": 1 }, { "type": "loss", "content": 0.032100606709718704, "timestamp": "2025-09-15 03:15:49.453778", "step": 31, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:49.485188", "step": 31, "epoch": 1 }, { "type": "loss", "content": 0.027776723727583885, "timestamp": "2025-09-15 03:15:49.509034", "step": 32, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:15:49.539707", "step": 32, "epoch": 1 }, { "type": "loss", "content": 0.01872324012219906, "timestamp": "2025-09-15 03:15:49.544386", "step": 33, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:15:49.574398", "step": 33, "epoch": 1 }, { "type": "loss", "content": 0.020866069942712784, "timestamp": "2025-09-15 03:15:49.581423", "step": 34, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:49.613794", "step": 34, "epoch": 1 }, { "type": "loss", "content": 0.035089291632175446, "timestamp": "2025-09-15 03:15:49.618321", "step": 35, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:49.648605", "step": 35, "epoch": 1 }, { "type": "loss", "content": 0.03352435305714607, "timestamp": "2025-09-15 03:15:49.672343", "step": 36, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:49.702206", "step": 36, "epoch": 1 }, { "type": "loss", "content": 0.025466276332736015, "timestamp": "2025-09-15 03:15:49.704640", "step": 37, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:49.734735", "step": 37, "epoch": 1 }, { "type": "loss", "content": 0.032446447759866714, "timestamp": "2025-09-15 03:15:49.737475", "step": 38, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:49.767083", "step": 38, "epoch": 1 }, { "type": "loss", "content": 0.022059399634599686, "timestamp": "2025-09-15 03:15:49.769885", "step": 39, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:15:50.358852", "step": 39, "epoch": 1 }, { "type": "pplx", "content": 52920853.06323647, "timestamp": "2025-09-15 03:15:50.360958", "step": 39, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:50.390249", "step": 39, "epoch": 1 }, { "type": "loss", "content": 0.021340548992156982, "timestamp": "2025-09-15 03:15:50.415440", "step": 40, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:15:50.446315", "step": 40, "epoch": 1 }, { "type": "loss", "content": 0.02679264359176159, "timestamp": "2025-09-15 03:15:50.450724", "step": 41, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:50.481270", "step": 41, "epoch": 1 }, { "type": "loss", "content": 0.02252059243619442, "timestamp": "2025-09-15 03:15:50.485813", "step": 42, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:50.515860", "step": 42, "epoch": 1 }, { "type": "loss", "content": 0.021507279947400093, "timestamp": "2025-09-15 03:15:50.518693", "step": 43, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:15:50.549161", "step": 43, "epoch": 1 }, { "type": "loss", "content": 0.022949328646063805, "timestamp": "2025-09-15 03:15:50.577297", "step": 44, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:50.607047", "step": 44, "epoch": 1 }, { "type": "loss", "content": 0.024059386923909187, "timestamp": "2025-09-15 03:15:50.609358", "step": 45, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:50.640049", "step": 45, "epoch": 1 }, { "type": "loss", "content": 0.024520207196474075, "timestamp": "2025-09-15 03:15:50.644798", "step": 46, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:50.675390", "step": 46, "epoch": 1 }, { "type": "loss", "content": 0.02677883766591549, "timestamp": "2025-09-15 03:15:50.677449", "step": 47, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:15:50.707369", "step": 47, "epoch": 1 }, { "type": "loss", "content": 0.02447492815554142, "timestamp": "2025-09-15 03:15:50.731043", "step": 48, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:50.761421", "step": 48, "epoch": 1 }, { "type": "loss", "content": 0.023482685908675194, "timestamp": "2025-09-15 03:15:50.763530", "step": 49, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:50.793696", "step": 49, "epoch": 1 }, { "type": "loss", "content": 0.02031794749200344, "timestamp": "2025-09-15 03:15:50.795850", "step": 50, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:50.828078", "step": 50, "epoch": 1 }, { "type": "loss", "content": 0.02589653804898262, "timestamp": "2025-09-15 03:15:50.829859", "step": 51, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:15:50.860142", "step": 51, "epoch": 1 }, { "type": "loss", "content": 0.02244141697883606, "timestamp": "2025-09-15 03:15:50.883652", "step": 52, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:50.913689", "step": 52, "epoch": 1 }, { "type": "loss", "content": 0.03502468392252922, "timestamp": "2025-09-15 03:15:50.915999", "step": 53, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:50.946182", "step": 53, "epoch": 1 }, { "type": "loss", "content": 0.031696025282144547, "timestamp": "2025-09-15 03:15:50.948185", "step": 54, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:50.978060", "step": 54, "epoch": 1 }, { "type": "loss", "content": 0.03126489743590355, "timestamp": "2025-09-15 03:15:50.982354", "step": 55, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:51.012793", "step": 55, "epoch": 1 }, { "type": "loss", "content": 0.025056706741452217, "timestamp": "2025-09-15 03:15:51.036458", "step": 56, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:51.066709", "step": 56, "epoch": 1 }, { "type": "loss", "content": 0.025800740346312523, "timestamp": "2025-09-15 03:15:51.068884", "step": 57, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:15:51.098933", "step": 57, "epoch": 1 }, { "type": "loss", "content": 0.036419469863176346, "timestamp": "2025-09-15 03:15:51.102043", "step": 58, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:15:51.133067", "step": 58, "epoch": 1 }, { "type": "loss", "content": 0.026494473218917847, "timestamp": "2025-09-15 03:15:51.135509", "step": 59, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:51.165529", "step": 59, "epoch": 1 }, { "type": "loss", "content": 0.02609933726489544, "timestamp": "2025-09-15 03:15:51.189030", "step": 60, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:15:51.219232", "step": 60, "epoch": 1 }, { "type": "loss", "content": 0.02513437159359455, "timestamp": "2025-09-15 03:15:51.221345", "step": 61, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:51.251723", "step": 61, "epoch": 1 }, { "type": "loss", "content": 0.016333062201738358, "timestamp": "2025-09-15 03:15:51.255883", "step": 62, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:15:51.285617", "step": 62, "epoch": 1 }, { "type": "loss", "content": 0.02447478286921978, "timestamp": "2025-09-15 03:15:51.287806", "step": 63, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:51.320461", "step": 63, "epoch": 1 }, { "type": "loss", "content": 0.02058406174182892, "timestamp": "2025-09-15 03:15:51.344363", "step": 64, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:51.374879", "step": 64, "epoch": 1 }, { "type": "loss", "content": 0.0208949763327837, "timestamp": "2025-09-15 03:15:51.377047", "step": 65, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:51.407319", "step": 65, "epoch": 1 }, { "type": "loss", "content": 0.02405490167438984, "timestamp": "2025-09-15 03:15:51.410066", "step": 66, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:51.440150", "step": 66, "epoch": 1 }, { "type": "loss", "content": 0.023688603192567825, "timestamp": "2025-09-15 03:15:51.444218", "step": 67, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:15:51.474348", "step": 67, "epoch": 1 }, { "type": "loss", "content": 0.017248744145035744, "timestamp": "2025-09-15 03:15:51.497879", "step": 68, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:51.528228", "step": 68, "epoch": 1 }, { "type": "loss", "content": 0.02224724553525448, "timestamp": "2025-09-15 03:15:51.530547", "step": 69, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:51.560979", "step": 69, "epoch": 1 }, { "type": "loss", "content": 0.02748028375208378, "timestamp": "2025-09-15 03:15:51.563031", "step": 70, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:15:51.592871", "step": 70, "epoch": 1 }, { "type": "loss", "content": 0.020563840866088867, "timestamp": "2025-09-15 03:15:51.595648", "step": 71, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:15:51.625709", "step": 71, "epoch": 1 }, { "type": "loss", "content": 0.02229272946715355, "timestamp": "2025-09-15 03:15:51.649493", "step": 72, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:15:51.679161", "step": 72, "epoch": 1 }, { "type": "loss", "content": 0.026734307408332825, "timestamp": "2025-09-15 03:15:51.681088", "step": 73, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:51.710478", "step": 73, "epoch": 1 }, { "type": "loss", "content": 0.025780189782381058, "timestamp": "2025-09-15 03:15:51.713035", "step": 74, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:51.742738", "step": 74, "epoch": 1 }, { "type": "loss", "content": 0.026472801342606544, "timestamp": "2025-09-15 03:15:51.744788", "step": 75, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:51.775551", "step": 75, "epoch": 1 }, { "type": "loss", "content": 0.032414261251688004, "timestamp": "2025-09-15 03:15:51.801191", "step": 76, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:51.831424", "step": 76, "epoch": 1 }, { "type": "loss", "content": 0.021320512518286705, "timestamp": "2025-09-15 03:15:51.833418", "step": 77, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:15:51.863813", "step": 77, "epoch": 1 }, { "type": "loss", "content": 0.021593904122710228, "timestamp": "2025-09-15 03:15:51.870660", "step": 78, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:15:52.458394", "step": 78, "epoch": 1 }, { "type": "pplx", "content": 58913638.63486896, "timestamp": "2025-09-15 03:15:52.460527", "step": 78, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:15:52.490632", "step": 78, "epoch": 1 }, { "type": "loss", "content": 0.02848268486559391, "timestamp": "2025-09-15 03:15:52.497856", "step": 79, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:52.528459", "step": 79, "epoch": 1 }, { "type": "loss", "content": 0.023239096626639366, "timestamp": "2025-09-15 03:15:52.553583", "step": 80, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:15:52.584198", "step": 80, "epoch": 1 }, { "type": "loss", "content": 0.021547870710492134, "timestamp": "2025-09-15 03:15:52.588513", "step": 81, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:52.618909", "step": 81, "epoch": 1 }, { "type": "loss", "content": 0.032910946756601334, "timestamp": "2025-09-15 03:15:52.623547", "step": 82, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:15:52.654135", "step": 82, "epoch": 1 }, { "type": "loss", "content": 0.030047548934817314, "timestamp": "2025-09-15 03:15:52.661792", "step": 83, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:52.693315", "step": 83, "epoch": 1 }, { "type": "loss", "content": 0.02654549479484558, "timestamp": "2025-09-15 03:15:52.718488", "step": 84, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:15:52.751343", "step": 84, "epoch": 1 }, { "type": "loss", "content": 0.020856261253356934, "timestamp": "2025-09-15 03:15:52.756220", "step": 85, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:52.786161", "step": 85, "epoch": 1 }, { "type": "loss", "content": 0.02457808144390583, "timestamp": "2025-09-15 03:15:52.788914", "step": 86, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:15:52.818915", "step": 86, "epoch": 1 }, { "type": "loss", "content": 0.024185696616768837, "timestamp": "2025-09-15 03:15:52.826735", "step": 87, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:52.856352", "step": 87, "epoch": 1 }, { "type": "loss", "content": 0.02541220188140869, "timestamp": "2025-09-15 03:15:52.879910", "step": 88, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:15:52.910414", "step": 88, "epoch": 1 }, { "type": "loss", "content": 0.026249513030052185, "timestamp": "2025-09-15 03:15:52.915050", "step": 89, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:52.945484", "step": 89, "epoch": 1 }, { "type": "loss", "content": 0.02385100908577442, "timestamp": "2025-09-15 03:15:52.948291", "step": 90, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:52.978939", "step": 90, "epoch": 1 }, { "type": "loss", "content": 0.024936696514487267, "timestamp": "2025-09-15 03:15:52.981821", "step": 91, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:53.011926", "step": 91, "epoch": 1 }, { "type": "loss", "content": 0.024910280480980873, "timestamp": "2025-09-15 03:15:53.037598", "step": 92, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:53.068317", "step": 92, "epoch": 1 }, { "type": "loss", "content": 0.02396443299949169, "timestamp": "2025-09-15 03:15:53.070289", "step": 93, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:53.100698", "step": 93, "epoch": 1 }, { "type": "loss", "content": 0.025535132735967636, "timestamp": "2025-09-15 03:15:53.105185", "step": 94, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:53.134928", "step": 94, "epoch": 1 }, { "type": "loss", "content": 0.023879891261458397, "timestamp": "2025-09-15 03:15:53.139811", "step": 95, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:53.170030", "step": 95, "epoch": 1 }, { "type": "loss", "content": 0.024169299751520157, "timestamp": "2025-09-15 03:15:53.195202", "step": 96, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:53.224970", "step": 96, "epoch": 1 }, { "type": "loss", "content": 0.0213132593780756, "timestamp": "2025-09-15 03:15:53.228309", "step": 97, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:53.258572", "step": 97, "epoch": 1 }, { "type": "loss", "content": 0.024025261402130127, "timestamp": "2025-09-15 03:15:53.260512", "step": 98, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:15:53.292982", "step": 98, "epoch": 1 }, { "type": "loss", "content": 0.02572092041373253, "timestamp": "2025-09-15 03:15:53.300802", "step": 99, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:53.330467", "step": 99, "epoch": 1 }, { "type": "loss", "content": 0.024027112871408463, "timestamp": "2025-09-15 03:15:53.354365", "step": 100, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:15:53.384505", "step": 100, "epoch": 1 }, { "type": "loss", "content": 0.025775250047445297, "timestamp": "2025-09-15 03:15:53.389232", "step": 101, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:53.419685", "step": 101, "epoch": 1 }, { "type": "loss", "content": 0.027797836810350418, "timestamp": "2025-09-15 03:15:53.424118", "step": 102, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:53.453998", "step": 102, "epoch": 1 }, { "type": "loss", "content": 0.02851773239672184, "timestamp": "2025-09-15 03:15:53.456892", "step": 103, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:53.486813", "step": 103, "epoch": 1 }, { "type": "loss", "content": 0.022648418322205544, "timestamp": "2025-09-15 03:15:53.512430", "step": 104, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:15:53.547607", "step": 104, "epoch": 1 }, { "type": "loss", "content": 0.029527029022574425, "timestamp": "2025-09-15 03:15:53.553297", "step": 105, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:15:53.583034", "step": 105, "epoch": 1 }, { "type": "loss", "content": 0.026798781007528305, "timestamp": "2025-09-15 03:15:53.585021", "step": 106, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:53.615159", "step": 106, "epoch": 1 }, { "type": "loss", "content": 0.027438562363386154, "timestamp": "2025-09-15 03:15:53.619289", "step": 107, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:53.649355", "step": 107, "epoch": 1 }, { "type": "loss", "content": 0.026652364060282707, "timestamp": "2025-09-15 03:15:53.672826", "step": 108, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:53.702413", "step": 108, "epoch": 1 }, { "type": "loss", "content": 0.019344601780176163, "timestamp": "2025-09-15 03:15:53.704348", "step": 109, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:53.734672", "step": 109, "epoch": 1 }, { "type": "loss", "content": 0.029362743720412254, "timestamp": "2025-09-15 03:15:53.738825", "step": 110, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:53.769329", "step": 110, "epoch": 1 }, { "type": "loss", "content": 0.02442595362663269, "timestamp": "2025-09-15 03:15:53.773998", "step": 111, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:53.803955", "step": 111, "epoch": 1 }, { "type": "loss", "content": 0.024722302332520485, "timestamp": "2025-09-15 03:15:53.827441", "step": 112, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:53.857473", "step": 112, "epoch": 1 }, { "type": "loss", "content": 0.024500925093889236, "timestamp": "2025-09-15 03:15:53.859457", "step": 113, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:15:53.889697", "step": 113, "epoch": 1 }, { "type": "loss", "content": 0.022203342989087105, "timestamp": "2025-09-15 03:15:53.897301", "step": 114, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:53.927337", "step": 114, "epoch": 1 }, { "type": "loss", "content": 0.027586471289396286, "timestamp": "2025-09-15 03:15:53.929333", "step": 115, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:53.959884", "step": 115, "epoch": 1 }, { "type": "loss", "content": 0.027563441544771194, "timestamp": "2025-09-15 03:15:53.985276", "step": 116, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:15:54.015606", "step": 116, "epoch": 1 }, { "type": "loss", "content": 0.018406938761472702, "timestamp": "2025-09-15 03:15:54.017604", "step": 117, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:15:54.612600", "step": 117, "epoch": 1 }, { "type": "pplx", "content": 59109284.580648854, "timestamp": "2025-09-15 03:15:54.614935", "step": 117, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:15:54.644023", "step": 117, "epoch": 1 }, { "type": "loss", "content": 0.022183803841471672, "timestamp": "2025-09-15 03:15:54.650679", "step": 118, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:54.680911", "step": 118, "epoch": 1 }, { "type": "loss", "content": 0.02371959201991558, "timestamp": "2025-09-15 03:15:54.685557", "step": 119, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 288 ], "flops": 8543129804160 }, "timestamp": "2025-09-15 03:15:54.726201", "step": 119, "epoch": 1 }, { "type": "loss", "content": 0.024669630452990532, "timestamp": "2025-09-15 03:15:54.753551", "step": 120, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:54.784220", "step": 120, "epoch": 1 }, { "type": "loss", "content": 0.025222482159733772, "timestamp": "2025-09-15 03:15:54.787092", "step": 121, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:54.817780", "step": 121, "epoch": 1 }, { "type": "loss", "content": 0.023705070838332176, "timestamp": "2025-09-15 03:15:54.819784", "step": 122, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:54.850149", "step": 122, "epoch": 1 }, { "type": "loss", "content": 0.02747977338731289, "timestamp": "2025-09-15 03:15:54.854848", "step": 123, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:15:54.885557", "step": 123, "epoch": 1 }, { "type": "loss", "content": 0.028290854766964912, "timestamp": "2025-09-15 03:15:54.913617", "step": 124, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:54.943352", "step": 124, "epoch": 1 }, { "type": "loss", "content": 0.023249467834830284, "timestamp": "2025-09-15 03:15:54.945356", "step": 125, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:54.975479", "step": 125, "epoch": 1 }, { "type": "loss", "content": 0.020255951210856438, "timestamp": "2025-09-15 03:15:54.979941", "step": 126, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:15:55.010679", "step": 126, "epoch": 1 }, { "type": "loss", "content": 0.024496247991919518, "timestamp": "2025-09-15 03:15:55.013241", "step": 127, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:15:55.042930", "step": 127, "epoch": 1 }, { "type": "loss", "content": 0.02648274041712284, "timestamp": "2025-09-15 03:15:55.066591", "step": 128, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:55.097421", "step": 128, "epoch": 1 }, { "type": "loss", "content": 0.025107769295573235, "timestamp": "2025-09-15 03:15:55.099702", "step": 129, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-15 03:15:55.145696", "step": 129, "epoch": 1 }, { "type": "loss", "content": 0.025274476036429405, "timestamp": "2025-09-15 03:15:55.151458", "step": 130, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:55.182612", "step": 130, "epoch": 1 }, { "type": "loss", "content": 0.023525172844529152, "timestamp": "2025-09-15 03:15:55.186745", "step": 131, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:15:55.216811", "step": 131, "epoch": 1 }, { "type": "loss", "content": 0.021011391654610634, "timestamp": "2025-09-15 03:15:55.240430", "step": 132, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:55.270318", "step": 132, "epoch": 1 }, { "type": "loss", "content": 0.024462955072522163, "timestamp": "2025-09-15 03:15:55.272363", "step": 133, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:15:55.302730", "step": 133, "epoch": 1 }, { "type": "loss", "content": 0.02858763374388218, "timestamp": "2025-09-15 03:15:55.309554", "step": 134, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:15:55.339946", "step": 134, "epoch": 1 }, { "type": "loss", "content": 0.02398851327598095, "timestamp": "2025-09-15 03:15:55.347655", "step": 135, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:55.378900", "step": 135, "epoch": 1 }, { "type": "loss", "content": 0.024971574544906616, "timestamp": "2025-09-15 03:15:55.404070", "step": 136, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:15:55.436106", "step": 136, "epoch": 1 }, { "type": "loss", "content": 0.02387789450585842, "timestamp": "2025-09-15 03:15:55.438113", "step": 137, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:55.467922", "step": 137, "epoch": 1 }, { "type": "loss", "content": 0.023335954174399376, "timestamp": "2025-09-15 03:15:55.470408", "step": 138, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:55.500276", "step": 138, "epoch": 1 }, { "type": "loss", "content": 0.02436818554997444, "timestamp": "2025-09-15 03:15:55.505054", "step": 139, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:15:55.535284", "step": 139, "epoch": 1 }, { "type": "loss", "content": 0.024393929168581963, "timestamp": "2025-09-15 03:15:55.558851", "step": 140, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:15:55.590050", "step": 140, "epoch": 1 }, { "type": "loss", "content": 0.02338075265288353, "timestamp": "2025-09-15 03:15:55.591988", "step": 141, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:15:55.622511", "step": 141, "epoch": 1 }, { "type": "loss", "content": 0.02519969828426838, "timestamp": "2025-09-15 03:15:55.630444", "step": 142, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:55.660561", "step": 142, "epoch": 1 }, { "type": "loss", "content": 0.022787822410464287, "timestamp": "2025-09-15 03:15:55.662793", "step": 143, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:55.692882", "step": 143, "epoch": 1 }, { "type": "loss", "content": 0.02562013640999794, "timestamp": "2025-09-15 03:15:55.716477", "step": 144, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:55.746652", "step": 144, "epoch": 1 }, { "type": "loss", "content": 0.027317887172102928, "timestamp": "2025-09-15 03:15:55.748648", "step": 145, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:55.778678", "step": 145, "epoch": 1 }, { "type": "loss", "content": 0.021526094526052475, "timestamp": "2025-09-15 03:15:55.780818", "step": 146, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:55.810725", "step": 146, "epoch": 1 }, { "type": "loss", "content": 0.025312988087534904, "timestamp": "2025-09-15 03:15:55.812908", "step": 147, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:55.843838", "step": 147, "epoch": 1 }, { "type": "loss", "content": 0.023540545254945755, "timestamp": "2025-09-15 03:15:55.869018", "step": 148, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:55.898872", "step": 148, "epoch": 1 }, { "type": "loss", "content": 0.02694949321448803, "timestamp": "2025-09-15 03:15:55.900801", "step": 149, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:55.930984", "step": 149, "epoch": 1 }, { "type": "loss", "content": 0.023923378437757492, "timestamp": "2025-09-15 03:15:55.935684", "step": 150, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:55.965886", "step": 150, "epoch": 1 }, { "type": "loss", "content": 0.02161807008087635, "timestamp": "2025-09-15 03:15:55.968069", "step": 151, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:55.998458", "step": 151, "epoch": 1 }, { "type": "loss", "content": 0.021433347836136818, "timestamp": "2025-09-15 03:15:56.022123", "step": 152, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:56.052080", "step": 152, "epoch": 1 }, { "type": "loss", "content": 0.019754543900489807, "timestamp": "2025-09-15 03:15:56.054142", "step": 153, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:56.090431", "step": 153, "epoch": 1 }, { "type": "loss", "content": 0.028470566496253014, "timestamp": "2025-09-15 03:15:56.094495", "step": 154, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:56.124892", "step": 154, "epoch": 1 }, { "type": "loss", "content": 0.022595848888158798, "timestamp": "2025-09-15 03:15:56.128736", "step": 155, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:15:56.159437", "step": 155, "epoch": 1 }, { "type": "loss", "content": 0.024241095408797264, "timestamp": "2025-09-15 03:15:56.187115", "step": 156, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:15:56.773873", "step": 156, "epoch": 1 }, { "type": "pplx", "content": 61359476.46442279, "timestamp": "2025-09-15 03:15:56.775762", "step": 156, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:15:56.803488", "step": 156, "epoch": 1 }, { "type": "loss", "content": 0.023162925615906715, "timestamp": "2025-09-15 03:15:56.805798", "step": 157, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:56.836621", "step": 157, "epoch": 1 }, { "type": "loss", "content": 0.02335996739566326, "timestamp": "2025-09-15 03:15:56.840980", "step": 158, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:56.871586", "step": 158, "epoch": 1 }, { "type": "loss", "content": 0.018901418894529343, "timestamp": "2025-09-15 03:15:56.874769", "step": 159, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:15:56.907194", "step": 159, "epoch": 1 }, { "type": "loss", "content": 0.027393018826842308, "timestamp": "2025-09-15 03:15:56.934763", "step": 160, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:56.965969", "step": 160, "epoch": 1 }, { "type": "loss", "content": 0.027207305654883385, "timestamp": "2025-09-15 03:15:56.968080", "step": 161, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:56.998453", "step": 161, "epoch": 1 }, { "type": "loss", "content": 0.019821634516119957, "timestamp": "2025-09-15 03:15:57.000674", "step": 162, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:15:57.031445", "step": 162, "epoch": 1 }, { "type": "loss", "content": 0.0335393063724041, "timestamp": "2025-09-15 03:15:57.038628", "step": 163, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:57.070562", "step": 163, "epoch": 1 }, { "type": "loss", "content": 0.025750983506441116, "timestamp": "2025-09-15 03:15:57.095618", "step": 164, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:57.126733", "step": 164, "epoch": 1 }, { "type": "loss", "content": 0.021948251873254776, "timestamp": "2025-09-15 03:15:57.128873", "step": 165, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:57.159881", "step": 165, "epoch": 1 }, { "type": "loss", "content": 0.028559934347867966, "timestamp": "2025-09-15 03:15:57.163644", "step": 166, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:57.194848", "step": 166, "epoch": 1 }, { "type": "loss", "content": 0.021568192169070244, "timestamp": "2025-09-15 03:15:57.198723", "step": 167, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:57.228866", "step": 167, "epoch": 1 }, { "type": "loss", "content": 0.025854144245386124, "timestamp": "2025-09-15 03:15:57.252377", "step": 168, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:15:57.283626", "step": 168, "epoch": 1 }, { "type": "loss", "content": 0.02288070134818554, "timestamp": "2025-09-15 03:15:57.288305", "step": 169, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:57.318847", "step": 169, "epoch": 1 }, { "type": "loss", "content": 0.02382759191095829, "timestamp": "2025-09-15 03:15:57.322644", "step": 170, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:57.353391", "step": 170, "epoch": 1 }, { "type": "loss", "content": 0.02871537022292614, "timestamp": "2025-09-15 03:15:57.355815", "step": 171, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:15:57.387190", "step": 171, "epoch": 1 }, { "type": "loss", "content": 0.023803409188985825, "timestamp": "2025-09-15 03:15:57.414869", "step": 172, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:57.445955", "step": 172, "epoch": 1 }, { "type": "loss", "content": 0.02843991480767727, "timestamp": "2025-09-15 03:15:57.448143", "step": 173, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:57.479106", "step": 173, "epoch": 1 }, { "type": "loss", "content": 0.024328526109457016, "timestamp": "2025-09-15 03:15:57.482545", "step": 174, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:57.514158", "step": 174, "epoch": 1 }, { "type": "loss", "content": 0.01945444755256176, "timestamp": "2025-09-15 03:15:57.518490", "step": 175, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:57.549260", "step": 175, "epoch": 1 }, { "type": "loss", "content": 0.02223912812769413, "timestamp": "2025-09-15 03:15:57.572972", "step": 176, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:15:57.603100", "step": 176, "epoch": 1 }, { "type": "loss", "content": 0.02585512213408947, "timestamp": "2025-09-15 03:15:57.605148", "step": 177, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:57.636119", "step": 177, "epoch": 1 }, { "type": "loss", "content": 0.018943537026643753, "timestamp": "2025-09-15 03:15:57.638209", "step": 178, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:57.669228", "step": 178, "epoch": 1 }, { "type": "loss", "content": 0.021992197260260582, "timestamp": "2025-09-15 03:15:57.673270", "step": 179, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:57.703671", "step": 179, "epoch": 1 }, { "type": "loss", "content": 0.022745851427316666, "timestamp": "2025-09-15 03:15:57.727440", "step": 180, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:57.758530", "step": 180, "epoch": 1 }, { "type": "loss", "content": 0.024968743324279785, "timestamp": "2025-09-15 03:15:57.760547", "step": 181, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:15:57.791178", "step": 181, "epoch": 1 }, { "type": "loss", "content": 0.02671198733150959, "timestamp": "2025-09-15 03:15:57.798469", "step": 182, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:15:57.828950", "step": 182, "epoch": 1 }, { "type": "loss", "content": 0.02292976714670658, "timestamp": "2025-09-15 03:15:57.830983", "step": 183, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:57.862123", "step": 183, "epoch": 1 }, { "type": "loss", "content": 0.021866338327527046, "timestamp": "2025-09-15 03:15:57.885897", "step": 184, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:57.916912", "step": 184, "epoch": 1 }, { "type": "loss", "content": 0.022539768368005753, "timestamp": "2025-09-15 03:15:57.918855", "step": 185, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:57.949032", "step": 185, "epoch": 1 }, { "type": "loss", "content": 0.024748781695961952, "timestamp": "2025-09-15 03:15:57.952890", "step": 186, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:57.983690", "step": 186, "epoch": 1 }, { "type": "loss", "content": 0.025110218673944473, "timestamp": "2025-09-15 03:15:57.985849", "step": 187, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:15:58.019225", "step": 187, "epoch": 1 }, { "type": "loss", "content": 0.024864723905920982, "timestamp": "2025-09-15 03:15:58.046438", "step": 188, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:58.078628", "step": 188, "epoch": 1 }, { "type": "loss", "content": 0.020917575806379318, "timestamp": "2025-09-15 03:15:58.080833", "step": 189, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:58.110730", "step": 189, "epoch": 1 }, { "type": "loss", "content": 0.022952692583203316, "timestamp": "2025-09-15 03:15:58.114832", "step": 190, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:58.144868", "step": 190, "epoch": 1 }, { "type": "loss", "content": 0.026193810626864433, "timestamp": "2025-09-15 03:15:58.146922", "step": 191, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:58.177536", "step": 191, "epoch": 1 }, { "type": "loss", "content": 0.024926025420427322, "timestamp": "2025-09-15 03:15:58.200957", "step": 192, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:58.232462", "step": 192, "epoch": 1 }, { "type": "loss", "content": 0.024060726165771484, "timestamp": "2025-09-15 03:15:58.234556", "step": 193, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:58.265855", "step": 193, "epoch": 1 }, { "type": "loss", "content": 0.023254675790667534, "timestamp": "2025-09-15 03:15:58.269786", "step": 194, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:58.300019", "step": 194, "epoch": 1 }, { "type": "loss", "content": 0.021828945726156235, "timestamp": "2025-09-15 03:15:58.302229", "step": 195, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:15:58.890853", "step": 195, "epoch": 1 }, { "type": "pplx", "content": 63240685.34191352, "timestamp": "2025-09-15 03:15:58.892845", "step": 195, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:58.921710", "step": 195, "epoch": 1 }, { "type": "loss", "content": 0.02316051349043846, "timestamp": "2025-09-15 03:15:58.946872", "step": 196, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:58.977535", "step": 196, "epoch": 1 }, { "type": "loss", "content": 0.021014168858528137, "timestamp": "2025-09-15 03:15:58.980003", "step": 197, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:59.010514", "step": 197, "epoch": 1 }, { "type": "loss", "content": 0.02295663394033909, "timestamp": "2025-09-15 03:15:59.012910", "step": 198, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:15:59.043932", "step": 198, "epoch": 1 }, { "type": "loss", "content": 0.020413899794220924, "timestamp": "2025-09-15 03:15:59.050830", "step": 199, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:15:59.081558", "step": 199, "epoch": 1 }, { "type": "loss", "content": 0.027345729991793633, "timestamp": "2025-09-15 03:15:59.109124", "step": 200, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:15:59.140881", "step": 200, "epoch": 1 }, { "type": "loss", "content": 0.02064964361488819, "timestamp": "2025-09-15 03:15:59.145002", "step": 201, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:59.176590", "step": 201, "epoch": 1 }, { "type": "loss", "content": 0.02292543835937977, "timestamp": "2025-09-15 03:15:59.180177", "step": 202, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:15:59.210918", "step": 202, "epoch": 1 }, { "type": "loss", "content": 0.02036006562411785, "timestamp": "2025-09-15 03:15:59.212954", "step": 203, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:59.244523", "step": 203, "epoch": 1 }, { "type": "loss", "content": 0.025549402460455894, "timestamp": "2025-09-15 03:15:59.267999", "step": 204, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:59.298808", "step": 204, "epoch": 1 }, { "type": "loss", "content": 0.02759888581931591, "timestamp": "2025-09-15 03:15:59.301063", "step": 205, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:15:59.332024", "step": 205, "epoch": 1 }, { "type": "loss", "content": 0.02341906912624836, "timestamp": "2025-09-15 03:15:59.334162", "step": 206, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:15:59.364829", "step": 206, "epoch": 1 }, { "type": "loss", "content": 0.019058359786868095, "timestamp": "2025-09-15 03:15:59.367043", "step": 207, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:15:59.398513", "step": 207, "epoch": 1 }, { "type": "loss", "content": 0.023317178711295128, "timestamp": "2025-09-15 03:15:59.426222", "step": 208, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:59.457169", "step": 208, "epoch": 1 }, { "type": "loss", "content": 0.02509636990725994, "timestamp": "2025-09-15 03:15:59.459243", "step": 209, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:59.489381", "step": 209, "epoch": 1 }, { "type": "loss", "content": 0.02863742969930172, "timestamp": "2025-09-15 03:15:59.491500", "step": 210, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:15:59.522239", "step": 210, "epoch": 1 }, { "type": "loss", "content": 0.026905078440904617, "timestamp": "2025-09-15 03:15:59.528504", "step": 211, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:59.559391", "step": 211, "epoch": 1 }, { "type": "loss", "content": 0.020303964614868164, "timestamp": "2025-09-15 03:15:59.582997", "step": 212, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:15:59.613583", "step": 212, "epoch": 1 }, { "type": "loss", "content": 0.022942854091525078, "timestamp": "2025-09-15 03:15:59.615904", "step": 213, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:59.646749", "step": 213, "epoch": 1 }, { "type": "loss", "content": 0.029736299067735672, "timestamp": "2025-09-15 03:15:59.650708", "step": 214, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:15:59.681125", "step": 214, "epoch": 1 }, { "type": "loss", "content": 0.017598429694771767, "timestamp": "2025-09-15 03:15:59.683644", "step": 215, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:15:59.714059", "step": 215, "epoch": 1 }, { "type": "loss", "content": 0.020063329488039017, "timestamp": "2025-09-15 03:15:59.737749", "step": 216, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:15:59.768824", "step": 216, "epoch": 1 }, { "type": "loss", "content": 0.023120734840631485, "timestamp": "2025-09-15 03:15:59.770897", "step": 217, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:15:59.802112", "step": 217, "epoch": 1 }, { "type": "loss", "content": 0.01837027072906494, "timestamp": "2025-09-15 03:15:59.809582", "step": 218, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:15:59.842622", "step": 218, "epoch": 1 }, { "type": "loss", "content": 0.02000943198800087, "timestamp": "2025-09-15 03:15:59.846596", "step": 219, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:15:59.877789", "step": 219, "epoch": 1 }, { "type": "loss", "content": 0.02292727492749691, "timestamp": "2025-09-15 03:15:59.906404", "step": 220, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:15:59.937325", "step": 220, "epoch": 1 }, { "type": "loss", "content": 0.023077070713043213, "timestamp": "2025-09-15 03:15:59.939530", "step": 221, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-15 03:15:59.970641", "step": 221, "epoch": 1 }, { "type": "loss", "content": 0.023584123700857162, "timestamp": "2025-09-15 03:15:59.980604", "step": 222, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:00.010909", "step": 222, "epoch": 1 }, { "type": "loss", "content": 0.023418009281158447, "timestamp": "2025-09-15 03:16:00.017626", "step": 223, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:00.050416", "step": 223, "epoch": 1 }, { "type": "loss", "content": 0.023128027096390724, "timestamp": "2025-09-15 03:16:00.077827", "step": 224, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:00.111619", "step": 224, "epoch": 1 }, { "type": "loss", "content": 0.025857388973236084, "timestamp": "2025-09-15 03:16:00.118405", "step": 225, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:00.150814", "step": 225, "epoch": 1 }, { "type": "loss", "content": 0.02984294295310974, "timestamp": "2025-09-15 03:16:00.155137", "step": 226, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:00.188163", "step": 226, "epoch": 1 }, { "type": "loss", "content": 0.023621682077646255, "timestamp": "2025-09-15 03:16:00.192943", "step": 227, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:00.222693", "step": 227, "epoch": 1 }, { "type": "loss", "content": 0.023447996005415916, "timestamp": "2025-09-15 03:16:00.246168", "step": 228, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:00.276645", "step": 228, "epoch": 1 }, { "type": "loss", "content": 0.016722386702895164, "timestamp": "2025-09-15 03:16:00.281477", "step": 229, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:00.311349", "step": 229, "epoch": 1 }, { "type": "loss", "content": 0.01983584091067314, "timestamp": "2025-09-15 03:16:00.314175", "step": 230, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:00.344247", "step": 230, "epoch": 1 }, { "type": "loss", "content": 0.02162855677306652, "timestamp": "2025-09-15 03:16:00.346282", "step": 231, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:00.376759", "step": 231, "epoch": 1 }, { "type": "loss", "content": 0.0186337660998106, "timestamp": "2025-09-15 03:16:00.405397", "step": 232, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:00.435669", "step": 232, "epoch": 1 }, { "type": "loss", "content": 0.014806436374783516, "timestamp": "2025-09-15 03:16:00.441105", "step": 233, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:00.471606", "step": 233, "epoch": 1 }, { "type": "loss", "content": 0.025260770693421364, "timestamp": "2025-09-15 03:16:00.473838", "step": 234, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:16:01.059105", "step": 234, "epoch": 1 }, { "type": "pplx", "content": 68587555.94777116, "timestamp": "2025-09-15 03:16:01.061841", "step": 234, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:01.090645", "step": 234, "epoch": 1 }, { "type": "loss", "content": 0.023948902264237404, "timestamp": "2025-09-15 03:16:01.097497", "step": 235, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:16:01.138216", "step": 235, "epoch": 1 }, { "type": "loss", "content": 0.017647096887230873, "timestamp": "2025-09-15 03:16:01.166955", "step": 236, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:01.216207", "step": 236, "epoch": 1 }, { "type": "loss", "content": 0.03198563680052757, "timestamp": "2025-09-15 03:16:01.218513", "step": 237, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:01.253481", "step": 237, "epoch": 1 }, { "type": "loss", "content": 0.01824996992945671, "timestamp": "2025-09-15 03:16:01.258287", "step": 238, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:01.292269", "step": 238, "epoch": 1 }, { "type": "loss", "content": 0.017625119537115097, "timestamp": "2025-09-15 03:16:01.295143", "step": 239, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:01.325745", "step": 239, "epoch": 1 }, { "type": "loss", "content": 0.02666931040585041, "timestamp": "2025-09-15 03:16:01.351393", "step": 240, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:01.384605", "step": 240, "epoch": 1 }, { "type": "loss", "content": 0.02557450346648693, "timestamp": "2025-09-15 03:16:01.389564", "step": 241, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:01.426453", "step": 241, "epoch": 1 }, { "type": "loss", "content": 0.01579933427274227, "timestamp": "2025-09-15 03:16:01.430592", "step": 242, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:01.462033", "step": 242, "epoch": 1 }, { "type": "loss", "content": 0.020587289705872536, "timestamp": "2025-09-15 03:16:01.469014", "step": 243, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:01.503272", "step": 243, "epoch": 1 }, { "type": "loss", "content": 0.022956201806664467, "timestamp": "2025-09-15 03:16:01.528951", "step": 244, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:01.559007", "step": 244, "epoch": 1 }, { "type": "loss", "content": 0.01939821057021618, "timestamp": "2025-09-15 03:16:01.564059", "step": 245, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:01.598428", "step": 245, "epoch": 1 }, { "type": "loss", "content": 0.02754567377269268, "timestamp": "2025-09-15 03:16:01.609592", "step": 246, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:01.646882", "step": 246, "epoch": 1 }, { "type": "loss", "content": 0.019356006756424904, "timestamp": "2025-09-15 03:16:01.651543", "step": 247, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:01.681678", "step": 247, "epoch": 1 }, { "type": "loss", "content": 0.0224157627671957, "timestamp": "2025-09-15 03:16:01.706886", "step": 248, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:01.737442", "step": 248, "epoch": 1 }, { "type": "loss", "content": 0.020564500242471695, "timestamp": "2025-09-15 03:16:01.741870", "step": 249, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:01.771817", "step": 249, "epoch": 1 }, { "type": "loss", "content": 0.01674218662083149, "timestamp": "2025-09-15 03:16:01.774526", "step": 250, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:01.804875", "step": 250, "epoch": 1 }, { "type": "loss", "content": 0.021581009030342102, "timestamp": "2025-09-15 03:16:01.807230", "step": 251, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:01.837827", "step": 251, "epoch": 1 }, { "type": "loss", "content": 0.017507784068584442, "timestamp": "2025-09-15 03:16:01.861517", "step": 252, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:01.892258", "step": 252, "epoch": 1 }, { "type": "loss", "content": 0.01738118752837181, "timestamp": "2025-09-15 03:16:01.894337", "step": 253, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:01.924438", "step": 253, "epoch": 1 }, { "type": "loss", "content": 0.020651668310165405, "timestamp": "2025-09-15 03:16:01.928310", "step": 254, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:01.960365", "step": 254, "epoch": 1 }, { "type": "loss", "content": 0.026446225121617317, "timestamp": "2025-09-15 03:16:01.962690", "step": 255, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:02.007777", "step": 255, "epoch": 1 }, { "type": "loss", "content": 0.014327983371913433, "timestamp": "2025-09-15 03:16:02.031486", "step": 256, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:02.061650", "step": 256, "epoch": 1 }, { "type": "loss", "content": 0.0174112506210804, "timestamp": "2025-09-15 03:16:02.066336", "step": 257, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:16:02.097433", "step": 257, "epoch": 1 }, { "type": "loss", "content": 0.009260174818336964, "timestamp": "2025-09-15 03:16:02.105288", "step": 258, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:02.135256", "step": 258, "epoch": 1 }, { "type": "loss", "content": 0.010992279276251793, "timestamp": "2025-09-15 03:16:02.137198", "step": 259, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:02.167718", "step": 259, "epoch": 1 }, { "type": "loss", "content": 0.019353589043021202, "timestamp": "2025-09-15 03:16:02.195424", "step": 260, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:02.235178", "step": 260, "epoch": 1 }, { "type": "loss", "content": 0.01912912353873253, "timestamp": "2025-09-15 03:16:02.237274", "step": 261, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:02.267743", "step": 261, "epoch": 1 }, { "type": "loss", "content": 0.014038893394172192, "timestamp": "2025-09-15 03:16:02.270031", "step": 262, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:02.299986", "step": 262, "epoch": 1 }, { "type": "loss", "content": 0.012295400723814964, "timestamp": "2025-09-15 03:16:02.301907", "step": 263, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:02.331731", "step": 263, "epoch": 1 }, { "type": "loss", "content": 0.025572476908564568, "timestamp": "2025-09-15 03:16:02.355181", "step": 264, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:02.385263", "step": 264, "epoch": 1 }, { "type": "loss", "content": 0.015300482511520386, "timestamp": "2025-09-15 03:16:02.387305", "step": 265, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:02.417616", "step": 265, "epoch": 1 }, { "type": "loss", "content": 0.02356751821935177, "timestamp": "2025-09-15 03:16:02.419765", "step": 266, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:02.451581", "step": 266, "epoch": 1 }, { "type": "loss", "content": 0.004978724289685488, "timestamp": "2025-09-15 03:16:02.455768", "step": 267, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:02.485952", "step": 267, "epoch": 1 }, { "type": "loss", "content": 0.022996727377176285, "timestamp": "2025-09-15 03:16:02.509706", "step": 268, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:02.540134", "step": 268, "epoch": 1 }, { "type": "loss", "content": 0.02457096055150032, "timestamp": "2025-09-15 03:16:02.542113", "step": 269, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:02.572318", "step": 269, "epoch": 1 }, { "type": "loss", "content": 0.018262630328536034, "timestamp": "2025-09-15 03:16:02.575065", "step": 270, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:02.606054", "step": 270, "epoch": 1 }, { "type": "loss", "content": 0.02449517324566841, "timestamp": "2025-09-15 03:16:02.610680", "step": 271, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:02.641699", "step": 271, "epoch": 1 }, { "type": "loss", "content": 0.025416718795895576, "timestamp": "2025-09-15 03:16:02.669505", "step": 272, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:02.699836", "step": 272, "epoch": 1 }, { "type": "loss", "content": 0.023906651884317398, "timestamp": "2025-09-15 03:16:02.705064", "step": 273, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:16:03.293915", "step": 273, "epoch": 1 }, { "type": "pplx", "content": 77374753.92709866, "timestamp": "2025-09-15 03:16:03.295807", "step": 273, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:03.324814", "step": 273, "epoch": 1 }, { "type": "loss", "content": 0.011430694721639156, "timestamp": "2025-09-15 03:16:03.331480", "step": 274, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:03.360944", "step": 274, "epoch": 1 }, { "type": "loss", "content": 0.026782521978020668, "timestamp": "2025-09-15 03:16:03.362990", "step": 275, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-15 03:16:03.393194", "step": 275, "epoch": 1 }, { "type": "loss", "content": 0.012034070678055286, "timestamp": "2025-09-15 03:16:03.424345", "step": 276, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:03.454416", "step": 276, "epoch": 1 }, { "type": "loss", "content": 0.016790401190519333, "timestamp": "2025-09-15 03:16:03.457660", "step": 277, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:03.488260", "step": 277, "epoch": 1 }, { "type": "loss", "content": 0.01346579473465681, "timestamp": "2025-09-15 03:16:03.495872", "step": 278, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:03.526113", "step": 278, "epoch": 1 }, { "type": "loss", "content": 0.022128688171505928, "timestamp": "2025-09-15 03:16:03.530737", "step": 279, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:03.561100", "step": 279, "epoch": 1 }, { "type": "loss", "content": 0.012642556801438332, "timestamp": "2025-09-15 03:16:03.586256", "step": 280, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:03.616278", "step": 280, "epoch": 1 }, { "type": "loss", "content": 0.02572466991841793, "timestamp": "2025-09-15 03:16:03.618722", "step": 281, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:03.653866", "step": 281, "epoch": 1 }, { "type": "loss", "content": 0.017196912318468094, "timestamp": "2025-09-15 03:16:03.656866", "step": 282, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:16:03.689693", "step": 282, "epoch": 1 }, { "type": "loss", "content": 0.02792796678841114, "timestamp": "2025-09-15 03:16:03.697318", "step": 283, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 304 ], "flops": 9017733530176 }, "timestamp": "2025-09-15 03:16:03.739770", "step": 283, "epoch": 1 }, { "type": "loss", "content": 0.025717342272400856, "timestamp": "2025-09-15 03:16:03.767450", "step": 284, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:03.798160", "step": 284, "epoch": 1 }, { "type": "loss", "content": 0.027002578601241112, "timestamp": "2025-09-15 03:16:03.800246", "step": 285, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:03.830992", "step": 285, "epoch": 1 }, { "type": "loss", "content": 0.025492871180176735, "timestamp": "2025-09-15 03:16:03.837887", "step": 286, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:03.868229", "step": 286, "epoch": 1 }, { "type": "loss", "content": 0.018493708223104477, "timestamp": "2025-09-15 03:16:03.875013", "step": 287, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:03.905566", "step": 287, "epoch": 1 }, { "type": "loss", "content": 0.008588691242039204, "timestamp": "2025-09-15 03:16:03.931240", "step": 288, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:03.961557", "step": 288, "epoch": 1 }, { "type": "loss", "content": 0.003366441000252962, "timestamp": "2025-09-15 03:16:03.964499", "step": 289, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:03.995942", "step": 289, "epoch": 1 }, { "type": "loss", "content": 0.019292891025543213, "timestamp": "2025-09-15 03:16:04.000649", "step": 290, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:04.031152", "step": 290, "epoch": 1 }, { "type": "loss", "content": 0.02494696155190468, "timestamp": "2025-09-15 03:16:04.033506", "step": 291, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:04.064036", "step": 291, "epoch": 1 }, { "type": "loss", "content": 0.012115180492401123, "timestamp": "2025-09-15 03:16:04.088222", "step": 292, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:04.118593", "step": 292, "epoch": 1 }, { "type": "loss", "content": 0.016373474150896072, "timestamp": "2025-09-15 03:16:04.120561", "step": 293, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:04.150255", "step": 293, "epoch": 1 }, { "type": "loss", "content": 0.01117027085274458, "timestamp": "2025-09-15 03:16:04.152433", "step": 294, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:04.183598", "step": 294, "epoch": 1 }, { "type": "loss", "content": 0.009900221601128578, "timestamp": "2025-09-15 03:16:04.190417", "step": 295, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:04.221134", "step": 295, "epoch": 1 }, { "type": "loss", "content": 0.022509068250656128, "timestamp": "2025-09-15 03:16:04.244752", "step": 296, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:04.274919", "step": 296, "epoch": 1 }, { "type": "loss", "content": 0.005992523860186338, "timestamp": "2025-09-15 03:16:04.276991", "step": 297, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:04.307606", "step": 297, "epoch": 1 }, { "type": "loss", "content": 0.033216144889593124, "timestamp": "2025-09-15 03:16:04.312027", "step": 298, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:04.342252", "step": 298, "epoch": 1 }, { "type": "loss", "content": 0.012912544421851635, "timestamp": "2025-09-15 03:16:04.346608", "step": 299, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:04.376479", "step": 299, "epoch": 1 }, { "type": "loss", "content": 0.02481652982532978, "timestamp": "2025-09-15 03:16:04.404690", "step": 300, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:04.441606", "step": 300, "epoch": 1 }, { "type": "loss", "content": 0.01673789881169796, "timestamp": "2025-09-15 03:16:04.443856", "step": 301, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:04.474536", "step": 301, "epoch": 1 }, { "type": "loss", "content": 0.03715262562036514, "timestamp": "2025-09-15 03:16:04.478669", "step": 302, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:04.508637", "step": 302, "epoch": 1 }, { "type": "loss", "content": 0.004444826859980822, "timestamp": "2025-09-15 03:16:04.513313", "step": 303, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:04.543214", "step": 303, "epoch": 1 }, { "type": "loss", "content": 0.005952953360974789, "timestamp": "2025-09-15 03:16:04.566759", "step": 304, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:04.596758", "step": 304, "epoch": 1 }, { "type": "loss", "content": 0.03581325337290764, "timestamp": "2025-09-15 03:16:04.598732", "step": 305, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:04.628891", "step": 305, "epoch": 1 }, { "type": "loss", "content": 0.009639021940529346, "timestamp": "2025-09-15 03:16:04.635954", "step": 306, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:04.666569", "step": 306, "epoch": 1 }, { "type": "loss", "content": 0.03855990990996361, "timestamp": "2025-09-15 03:16:04.673739", "step": 307, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:04.705876", "step": 307, "epoch": 1 }, { "type": "loss", "content": 0.013932475820183754, "timestamp": "2025-09-15 03:16:04.734125", "step": 308, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:04.764417", "step": 308, "epoch": 1 }, { "type": "loss", "content": 0.00806758925318718, "timestamp": "2025-09-15 03:16:04.766439", "step": 309, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:04.796896", "step": 309, "epoch": 1 }, { "type": "loss", "content": 0.0032700072042644024, "timestamp": "2025-09-15 03:16:04.803803", "step": 310, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:04.834028", "step": 310, "epoch": 1 }, { "type": "loss", "content": 0.006026607472449541, "timestamp": "2025-09-15 03:16:04.835941", "step": 311, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:04.865621", "step": 311, "epoch": 1 }, { "type": "loss", "content": 0.013435856439173222, "timestamp": "2025-09-15 03:16:04.889411", "step": 312, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:16:05.473420", "step": 312, "epoch": 1 }, { "type": "pplx", "content": 89022351.57575472, "timestamp": "2025-09-15 03:16:05.475374", "step": 312, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:05.502935", "step": 312, "epoch": 1 }, { "type": "loss", "content": 0.036437805742025375, "timestamp": "2025-09-15 03:16:05.505087", "step": 313, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:05.535909", "step": 313, "epoch": 1 }, { "type": "loss", "content": 0.05053964629769325, "timestamp": "2025-09-15 03:16:05.538056", "step": 314, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:05.568376", "step": 314, "epoch": 1 }, { "type": "loss", "content": 0.017013149335980415, "timestamp": "2025-09-15 03:16:05.572587", "step": 315, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:05.603024", "step": 315, "epoch": 1 }, { "type": "loss", "content": 0.002634893637150526, "timestamp": "2025-09-15 03:16:05.628501", "step": 316, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:05.658537", "step": 316, "epoch": 1 }, { "type": "loss", "content": 0.005089492071419954, "timestamp": "2025-09-15 03:16:05.660675", "step": 317, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:05.690788", "step": 317, "epoch": 1 }, { "type": "loss", "content": 0.004525291733443737, "timestamp": "2025-09-15 03:16:05.693367", "step": 318, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:05.722921", "step": 318, "epoch": 1 }, { "type": "loss", "content": 0.016721466556191444, "timestamp": "2025-09-15 03:16:05.724874", "step": 319, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:05.754870", "step": 319, "epoch": 1 }, { "type": "loss", "content": 0.032533131539821625, "timestamp": "2025-09-15 03:16:05.778422", "step": 320, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:05.808211", "step": 320, "epoch": 1 }, { "type": "loss", "content": 0.004051280207931995, "timestamp": "2025-09-15 03:16:05.810112", "step": 321, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:05.840928", "step": 321, "epoch": 1 }, { "type": "loss", "content": 0.007642671465873718, "timestamp": "2025-09-15 03:16:05.848333", "step": 322, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:05.878559", "step": 322, "epoch": 1 }, { "type": "loss", "content": 0.004581484943628311, "timestamp": "2025-09-15 03:16:05.883138", "step": 323, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:05.914276", "step": 323, "epoch": 1 }, { "type": "loss", "content": 0.009225315414369106, "timestamp": "2025-09-15 03:16:05.939314", "step": 324, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:05.969234", "step": 324, "epoch": 1 }, { "type": "loss", "content": 0.0009719752706587315, "timestamp": "2025-09-15 03:16:05.971499", "step": 325, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:06.001728", "step": 325, "epoch": 1 }, { "type": "loss", "content": 0.033414021134376526, "timestamp": "2025-09-15 03:16:06.005825", "step": 326, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:06.036288", "step": 326, "epoch": 1 }, { "type": "loss", "content": 0.0025679587852209806, "timestamp": "2025-09-15 03:16:06.038522", "step": 327, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:06.068012", "step": 327, "epoch": 1 }, { "type": "loss", "content": 0.00764654204249382, "timestamp": "2025-09-15 03:16:06.091669", "step": 328, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:06.123720", "step": 328, "epoch": 1 }, { "type": "loss", "content": 0.02005215547978878, "timestamp": "2025-09-15 03:16:06.128928", "step": 329, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:06.158880", "step": 329, "epoch": 1 }, { "type": "loss", "content": 0.02345801331102848, "timestamp": "2025-09-15 03:16:06.161011", "step": 330, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:06.190959", "step": 330, "epoch": 1 }, { "type": "loss", "content": 0.02312997356057167, "timestamp": "2025-09-15 03:16:06.193407", "step": 331, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:06.224681", "step": 331, "epoch": 1 }, { "type": "loss", "content": 0.020947817713022232, "timestamp": "2025-09-15 03:16:06.248142", "step": 332, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:06.278708", "step": 332, "epoch": 1 }, { "type": "loss", "content": 0.06012619286775589, "timestamp": "2025-09-15 03:16:06.283450", "step": 333, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:06.330792", "step": 333, "epoch": 1 }, { "type": "loss", "content": 0.03723708540201187, "timestamp": "2025-09-15 03:16:06.333611", "step": 334, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:06.363381", "step": 334, "epoch": 1 }, { "type": "loss", "content": 0.010374733246862888, "timestamp": "2025-09-15 03:16:06.365496", "step": 335, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:06.395558", "step": 335, "epoch": 1 }, { "type": "loss", "content": 0.02376265451312065, "timestamp": "2025-09-15 03:16:06.419128", "step": 336, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:06.448906", "step": 336, "epoch": 1 }, { "type": "loss", "content": 0.04077703505754471, "timestamp": "2025-09-15 03:16:06.450933", "step": 337, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:06.480718", "step": 337, "epoch": 1 }, { "type": "loss", "content": 0.021582217887043953, "timestamp": "2025-09-15 03:16:06.482904", "step": 338, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:06.513294", "step": 338, "epoch": 1 }, { "type": "loss", "content": 0.043330080807209015, "timestamp": "2025-09-15 03:16:06.515453", "step": 339, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:06.545615", "step": 339, "epoch": 1 }, { "type": "loss", "content": 0.023107782006263733, "timestamp": "2025-09-15 03:16:06.569241", "step": 340, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:06.599564", "step": 340, "epoch": 1 }, { "type": "loss", "content": 0.026244070380926132, "timestamp": "2025-09-15 03:16:06.604508", "step": 341, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:06.634271", "step": 341, "epoch": 1 }, { "type": "loss", "content": 0.037476424127817154, "timestamp": "2025-09-15 03:16:06.636382", "step": 342, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:06.666516", "step": 342, "epoch": 1 }, { "type": "loss", "content": 0.0057834298349916935, "timestamp": "2025-09-15 03:16:06.670570", "step": 343, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:06.701597", "step": 343, "epoch": 1 }, { "type": "loss", "content": 0.01934720017015934, "timestamp": "2025-09-15 03:16:06.725330", "step": 344, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:06.755623", "step": 344, "epoch": 1 }, { "type": "loss", "content": 0.005825077183544636, "timestamp": "2025-09-15 03:16:06.760328", "step": 345, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:06.790320", "step": 345, "epoch": 1 }, { "type": "loss", "content": 0.008595766499638557, "timestamp": "2025-09-15 03:16:06.794700", "step": 346, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:06.824602", "step": 346, "epoch": 1 }, { "type": "loss", "content": 0.010267524980008602, "timestamp": "2025-09-15 03:16:06.827384", "step": 347, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:06.857976", "step": 347, "epoch": 1 }, { "type": "loss", "content": 0.011575107462704182, "timestamp": "2025-09-15 03:16:06.883325", "step": 348, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:06.913632", "step": 348, "epoch": 1 }, { "type": "loss", "content": 0.02673574723303318, "timestamp": "2025-09-15 03:16:06.915749", "step": 349, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:06.945985", "step": 349, "epoch": 1 }, { "type": "loss", "content": 0.0218326635658741, "timestamp": "2025-09-15 03:16:06.953220", "step": 350, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:06.983094", "step": 350, "epoch": 1 }, { "type": "loss", "content": 0.0157585758715868, "timestamp": "2025-09-15 03:16:06.987430", "step": 351, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:16:07.574117", "step": 351, "epoch": 1 }, { "type": "pplx", "content": 80515886.71850136, "timestamp": "2025-09-15 03:16:07.575945", "step": 351, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:07.605506", "step": 351, "epoch": 1 }, { "type": "loss", "content": 0.02800087444484234, "timestamp": "2025-09-15 03:16:07.632636", "step": 352, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:07.663026", "step": 352, "epoch": 1 }, { "type": "loss", "content": 0.04366505146026611, "timestamp": "2025-09-15 03:16:07.664951", "step": 353, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:07.694873", "step": 353, "epoch": 1 }, { "type": "loss", "content": 0.018956486135721207, "timestamp": "2025-09-15 03:16:07.697241", "step": 354, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:07.728686", "step": 354, "epoch": 1 }, { "type": "loss", "content": 0.01885378547012806, "timestamp": "2025-09-15 03:16:07.730703", "step": 355, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:07.761101", "step": 355, "epoch": 1 }, { "type": "loss", "content": 0.027556609362363815, "timestamp": "2025-09-15 03:16:07.789151", "step": 356, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:07.819784", "step": 356, "epoch": 1 }, { "type": "loss", "content": 0.021303247660398483, "timestamp": "2025-09-15 03:16:07.821766", "step": 357, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:07.851821", "step": 357, "epoch": 1 }, { "type": "loss", "content": 0.026625454425811768, "timestamp": "2025-09-15 03:16:07.853766", "step": 358, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:07.884845", "step": 358, "epoch": 1 }, { "type": "loss", "content": 0.012120525352656841, "timestamp": "2025-09-15 03:16:07.887214", "step": 359, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:16:07.917751", "step": 359, "epoch": 1 }, { "type": "loss", "content": 0.013295002281665802, "timestamp": "2025-09-15 03:16:07.946614", "step": 360, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:07.976683", "step": 360, "epoch": 1 }, { "type": "loss", "content": 0.01189348753541708, "timestamp": "2025-09-15 03:16:07.978817", "step": 361, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:08.008998", "step": 361, "epoch": 1 }, { "type": "loss", "content": 0.020258372649550438, "timestamp": "2025-09-15 03:16:08.013390", "step": 362, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:08.044661", "step": 362, "epoch": 1 }, { "type": "loss", "content": 0.02005624771118164, "timestamp": "2025-09-15 03:16:08.049028", "step": 363, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:08.079058", "step": 363, "epoch": 1 }, { "type": "loss", "content": 0.025262445211410522, "timestamp": "2025-09-15 03:16:08.102696", "step": 364, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:08.132990", "step": 364, "epoch": 1 }, { "type": "loss", "content": 0.014361625537276268, "timestamp": "2025-09-15 03:16:08.135138", "step": 365, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:08.165036", "step": 365, "epoch": 1 }, { "type": "loss", "content": 0.011819976381957531, "timestamp": "2025-09-15 03:16:08.167828", "step": 366, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:08.198223", "step": 366, "epoch": 1 }, { "type": "loss", "content": 0.014466187916696072, "timestamp": "2025-09-15 03:16:08.200731", "step": 367, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:08.231093", "step": 367, "epoch": 1 }, { "type": "loss", "content": 0.02088986523449421, "timestamp": "2025-09-15 03:16:08.256553", "step": 368, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:08.286505", "step": 368, "epoch": 1 }, { "type": "loss", "content": 0.01293105911463499, "timestamp": "2025-09-15 03:16:08.288490", "step": 369, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:08.318654", "step": 369, "epoch": 1 }, { "type": "loss", "content": 0.014627523720264435, "timestamp": "2025-09-15 03:16:08.323105", "step": 370, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:08.353213", "step": 370, "epoch": 1 }, { "type": "loss", "content": 0.020171448588371277, "timestamp": "2025-09-15 03:16:08.357974", "step": 371, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:08.387988", "step": 371, "epoch": 1 }, { "type": "loss", "content": 0.0217885784804821, "timestamp": "2025-09-15 03:16:08.411714", "step": 372, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:08.441612", "step": 372, "epoch": 1 }, { "type": "loss", "content": 0.015202087350189686, "timestamp": "2025-09-15 03:16:08.443503", "step": 373, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:08.473812", "step": 373, "epoch": 1 }, { "type": "loss", "content": 0.01769433729350567, "timestamp": "2025-09-15 03:16:08.476796", "step": 374, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:08.507228", "step": 374, "epoch": 1 }, { "type": "loss", "content": 0.020400600507855415, "timestamp": "2025-09-15 03:16:08.511747", "step": 375, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:08.541899", "step": 375, "epoch": 1 }, { "type": "loss", "content": 0.02527294121682644, "timestamp": "2025-09-15 03:16:08.569894", "step": 376, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:08.601274", "step": 376, "epoch": 1 }, { "type": "loss", "content": 0.01625806838274002, "timestamp": "2025-09-15 03:16:08.604014", "step": 377, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:08.635058", "step": 377, "epoch": 1 }, { "type": "loss", "content": 0.010705024935305119, "timestamp": "2025-09-15 03:16:08.642680", "step": 378, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:08.672820", "step": 378, "epoch": 1 }, { "type": "loss", "content": 0.027743404731154442, "timestamp": "2025-09-15 03:16:08.677407", "step": 379, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:08.707603", "step": 379, "epoch": 1 }, { "type": "loss", "content": 0.01390306930989027, "timestamp": "2025-09-15 03:16:08.731121", "step": 380, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:08.761695", "step": 380, "epoch": 1 }, { "type": "loss", "content": 0.015007426962256432, "timestamp": "2025-09-15 03:16:08.763849", "step": 381, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:08.794662", "step": 381, "epoch": 1 }, { "type": "loss", "content": 0.020648064091801643, "timestamp": "2025-09-15 03:16:08.801824", "step": 382, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:08.832571", "step": 382, "epoch": 1 }, { "type": "loss", "content": 0.024542948231101036, "timestamp": "2025-09-15 03:16:08.839611", "step": 383, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:08.870954", "step": 383, "epoch": 1 }, { "type": "loss", "content": 0.015119172632694244, "timestamp": "2025-09-15 03:16:08.899056", "step": 384, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:08.928788", "step": 384, "epoch": 1 }, { "type": "loss", "content": 0.013927723281085491, "timestamp": "2025-09-15 03:16:08.930791", "step": 385, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:08.960837", "step": 385, "epoch": 1 }, { "type": "loss", "content": 0.020588034763932228, "timestamp": "2025-09-15 03:16:08.968592", "step": 386, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:08.999144", "step": 386, "epoch": 1 }, { "type": "loss", "content": 0.010085856541991234, "timestamp": "2025-09-15 03:16:09.002084", "step": 387, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:09.032030", "step": 387, "epoch": 1 }, { "type": "loss", "content": 0.025173211470246315, "timestamp": "2025-09-15 03:16:09.055580", "step": 388, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:09.085476", "step": 388, "epoch": 1 }, { "type": "loss", "content": 0.013549414463341236, "timestamp": "2025-09-15 03:16:09.087514", "step": 389, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:09.118462", "step": 389, "epoch": 1 }, { "type": "loss", "content": 0.014271484687924385, "timestamp": "2025-09-15 03:16:09.125291", "step": 390, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:16:09.714210", "step": 390, "epoch": 1 }, { "type": "pplx", "content": 74571334.02457094, "timestamp": "2025-09-15 03:16:09.716103", "step": 390, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:09.745211", "step": 390, "epoch": 1 }, { "type": "loss", "content": 0.02279849909245968, "timestamp": "2025-09-15 03:16:09.747530", "step": 391, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:09.777890", "step": 391, "epoch": 1 }, { "type": "loss", "content": 0.024111445993185043, "timestamp": "2025-09-15 03:16:09.801520", "step": 392, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:09.832311", "step": 392, "epoch": 1 }, { "type": "loss", "content": 0.009041680954396725, "timestamp": "2025-09-15 03:16:09.837023", "step": 393, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:09.869468", "step": 393, "epoch": 1 }, { "type": "loss", "content": 0.016649093478918076, "timestamp": "2025-09-15 03:16:09.872050", "step": 394, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:09.902270", "step": 394, "epoch": 1 }, { "type": "loss", "content": 0.011015170253813267, "timestamp": "2025-09-15 03:16:09.906656", "step": 395, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-15 03:16:09.937062", "step": 395, "epoch": 1 }, { "type": "loss", "content": 0.007471670396625996, "timestamp": "2025-09-15 03:16:09.968302", "step": 396, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:09.998501", "step": 396, "epoch": 1 }, { "type": "loss", "content": 0.013137311674654484, "timestamp": "2025-09-15 03:16:10.000642", "step": 397, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:10.031338", "step": 397, "epoch": 1 }, { "type": "loss", "content": 0.028483940288424492, "timestamp": "2025-09-15 03:16:10.035614", "step": 398, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:10.066017", "step": 398, "epoch": 1 }, { "type": "loss", "content": 0.023605694994330406, "timestamp": "2025-09-15 03:16:10.073789", "step": 399, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:10.104027", "step": 399, "epoch": 1 }, { "type": "loss", "content": 0.015281885862350464, "timestamp": "2025-09-15 03:16:10.127551", "step": 400, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:10.157649", "step": 400, "epoch": 1 }, { "type": "loss", "content": 0.013710461556911469, "timestamp": "2025-09-15 03:16:10.159868", "step": 401, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:10.191049", "step": 401, "epoch": 1 }, { "type": "loss", "content": 0.03442396968603134, "timestamp": "2025-09-15 03:16:10.197908", "step": 402, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:10.229151", "step": 402, "epoch": 1 }, { "type": "loss", "content": 0.016301406547427177, "timestamp": "2025-09-15 03:16:10.236174", "step": 403, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:10.266148", "step": 403, "epoch": 1 }, { "type": "loss", "content": 0.025119978934526443, "timestamp": "2025-09-15 03:16:10.290013", "step": 404, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-15 03:16:10.321276", "step": 404, "epoch": 1 }, { "type": "loss", "content": 0.01429225504398346, "timestamp": "2025-09-15 03:16:10.328633", "step": 405, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:10.359475", "step": 405, "epoch": 1 }, { "type": "loss", "content": 0.004983606748282909, "timestamp": "2025-09-15 03:16:10.363745", "step": 406, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:10.395255", "step": 406, "epoch": 1 }, { "type": "loss", "content": 0.02256079576909542, "timestamp": "2025-09-15 03:16:10.402130", "step": 407, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:10.432333", "step": 407, "epoch": 1 }, { "type": "loss", "content": 0.006051547359675169, "timestamp": "2025-09-15 03:16:10.456188", "step": 408, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:10.489213", "step": 408, "epoch": 1 }, { "type": "loss", "content": 0.01480391900986433, "timestamp": "2025-09-15 03:16:10.491200", "step": 409, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:10.521790", "step": 409, "epoch": 1 }, { "type": "loss", "content": 0.010563238523900509, "timestamp": "2025-09-15 03:16:10.523920", "step": 410, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:10.554697", "step": 410, "epoch": 1 }, { "type": "loss", "content": 0.014779276214540005, "timestamp": "2025-09-15 03:16:10.558329", "step": 411, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:10.588689", "step": 411, "epoch": 1 }, { "type": "loss", "content": 0.005051398184150457, "timestamp": "2025-09-15 03:16:10.612367", "step": 412, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:10.643269", "step": 412, "epoch": 1 }, { "type": "loss", "content": 0.016271298751235008, "timestamp": "2025-09-15 03:16:10.645238", "step": 413, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:10.678636", "step": 413, "epoch": 1 }, { "type": "loss", "content": 0.013035394251346588, "timestamp": "2025-09-15 03:16:10.682660", "step": 414, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:10.714060", "step": 414, "epoch": 1 }, { "type": "loss", "content": 0.019557014107704163, "timestamp": "2025-09-15 03:16:10.720255", "step": 415, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:10.752032", "step": 415, "epoch": 1 }, { "type": "loss", "content": 0.01767909713089466, "timestamp": "2025-09-15 03:16:10.775676", "step": 416, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:10.807108", "step": 416, "epoch": 1 }, { "type": "loss", "content": 0.011926579289138317, "timestamp": "2025-09-15 03:16:10.811832", "step": 417, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:10.843174", "step": 417, "epoch": 1 }, { "type": "loss", "content": 0.01481988001614809, "timestamp": "2025-09-15 03:16:10.846654", "step": 418, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 336 ], "flops": 9966940982208 }, "timestamp": "2025-09-15 03:16:10.894362", "step": 418, "epoch": 1 }, { "type": "loss", "content": 0.02013516239821911, "timestamp": "2025-09-15 03:16:10.902422", "step": 419, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:10.933117", "step": 419, "epoch": 1 }, { "type": "loss", "content": 0.020092064514756203, "timestamp": "2025-09-15 03:16:10.956524", "step": 420, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:10.987665", "step": 420, "epoch": 1 }, { "type": "loss", "content": 0.009992966428399086, "timestamp": "2025-09-15 03:16:10.990192", "step": 421, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:11.021351", "step": 421, "epoch": 1 }, { "type": "loss", "content": 0.02413918450474739, "timestamp": "2025-09-15 03:16:11.023888", "step": 422, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:11.055527", "step": 422, "epoch": 1 }, { "type": "loss", "content": 0.005685184150934219, "timestamp": "2025-09-15 03:16:11.062706", "step": 423, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:11.093177", "step": 423, "epoch": 1 }, { "type": "loss", "content": 0.015884699299931526, "timestamp": "2025-09-15 03:16:11.116857", "step": 424, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:11.147706", "step": 424, "epoch": 1 }, { "type": "loss", "content": 0.022595107555389404, "timestamp": "2025-09-15 03:16:11.149911", "step": 425, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:11.180806", "step": 425, "epoch": 1 }, { "type": "loss", "content": 0.005032018758356571, "timestamp": "2025-09-15 03:16:11.184481", "step": 426, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:11.215158", "step": 426, "epoch": 1 }, { "type": "loss", "content": 0.007993181236088276, "timestamp": "2025-09-15 03:16:11.217514", "step": 427, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:11.248219", "step": 427, "epoch": 1 }, { "type": "loss", "content": 0.015601440332829952, "timestamp": "2025-09-15 03:16:11.271699", "step": 428, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:11.302226", "step": 428, "epoch": 1 }, { "type": "loss", "content": 0.022765683010220528, "timestamp": "2025-09-15 03:16:11.304289", "step": 429, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:16:11.889127", "step": 429, "epoch": 1 }, { "type": "pplx", "content": 82419156.97382797, "timestamp": "2025-09-15 03:16:11.891298", "step": 429, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:11.920202", "step": 429, "epoch": 1 }, { "type": "loss", "content": 0.007856682874262333, "timestamp": "2025-09-15 03:16:11.922209", "step": 430, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:11.953331", "step": 430, "epoch": 1 }, { "type": "loss", "content": 0.012552931904792786, "timestamp": "2025-09-15 03:16:11.960394", "step": 431, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:11.990508", "step": 431, "epoch": 1 }, { "type": "loss", "content": 0.015140875242650509, "timestamp": "2025-09-15 03:16:12.014126", "step": 432, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:12.044589", "step": 432, "epoch": 1 }, { "type": "loss", "content": 0.011779015883803368, "timestamp": "2025-09-15 03:16:12.048748", "step": 433, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:12.079806", "step": 433, "epoch": 1 }, { "type": "loss", "content": 0.028670471161603928, "timestamp": "2025-09-15 03:16:12.083634", "step": 434, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:12.114367", "step": 434, "epoch": 1 }, { "type": "loss", "content": 0.01725790835916996, "timestamp": "2025-09-15 03:16:12.116305", "step": 435, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:16:12.146651", "step": 435, "epoch": 1 }, { "type": "loss", "content": 0.023671019822359085, "timestamp": "2025-09-15 03:16:12.175083", "step": 436, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:12.206069", "step": 436, "epoch": 1 }, { "type": "loss", "content": 0.002824623603373766, "timestamp": "2025-09-15 03:16:12.208020", "step": 437, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:12.238555", "step": 437, "epoch": 1 }, { "type": "loss", "content": 0.012223334051668644, "timestamp": "2025-09-15 03:16:12.245523", "step": 438, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:12.276347", "step": 438, "epoch": 1 }, { "type": "loss", "content": 0.010377810336649418, "timestamp": "2025-09-15 03:16:12.278469", "step": 439, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:12.309576", "step": 439, "epoch": 1 }, { "type": "loss", "content": 0.02659183368086815, "timestamp": "2025-09-15 03:16:12.333236", "step": 440, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:12.364400", "step": 440, "epoch": 1 }, { "type": "loss", "content": 0.006721060257405043, "timestamp": "2025-09-15 03:16:12.368362", "step": 441, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:12.400872", "step": 441, "epoch": 1 }, { "type": "loss", "content": 0.035596225410699844, "timestamp": "2025-09-15 03:16:12.402912", "step": 442, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:12.434756", "step": 442, "epoch": 1 }, { "type": "loss", "content": 0.0020581563003361225, "timestamp": "2025-09-15 03:16:12.441807", "step": 443, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:12.472652", "step": 443, "epoch": 1 }, { "type": "loss", "content": 0.007875720039010048, "timestamp": "2025-09-15 03:16:12.496083", "step": 444, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:12.527247", "step": 444, "epoch": 1 }, { "type": "loss", "content": 0.009889481589198112, "timestamp": "2025-09-15 03:16:12.529371", "step": 445, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:12.560602", "step": 445, "epoch": 1 }, { "type": "loss", "content": 0.03284718841314316, "timestamp": "2025-09-15 03:16:12.562823", "step": 446, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:12.593710", "step": 446, "epoch": 1 }, { "type": "loss", "content": 0.01849648542702198, "timestamp": "2025-09-15 03:16:12.596021", "step": 447, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:12.626581", "step": 447, "epoch": 1 }, { "type": "loss", "content": 0.03222808614373207, "timestamp": "2025-09-15 03:16:12.650479", "step": 448, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:12.681732", "step": 448, "epoch": 1 }, { "type": "loss", "content": 0.011861540377140045, "timestamp": "2025-09-15 03:16:12.683818", "step": 449, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:12.714390", "step": 449, "epoch": 1 }, { "type": "loss", "content": 0.014407354407012463, "timestamp": "2025-09-15 03:16:12.718207", "step": 450, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:12.748950", "step": 450, "epoch": 1 }, { "type": "loss", "content": 0.005681209731847048, "timestamp": "2025-09-15 03:16:12.751246", "step": 451, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:12.781759", "step": 451, "epoch": 1 }, { "type": "loss", "content": 0.03094957210123539, "timestamp": "2025-09-15 03:16:12.805505", "step": 452, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:12.836427", "step": 452, "epoch": 1 }, { "type": "loss", "content": 0.004415884148329496, "timestamp": "2025-09-15 03:16:12.838703", "step": 453, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:12.869797", "step": 453, "epoch": 1 }, { "type": "loss", "content": 0.023646658286452293, "timestamp": "2025-09-15 03:16:12.871917", "step": 454, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:12.902870", "step": 454, "epoch": 1 }, { "type": "loss", "content": 0.01825256459414959, "timestamp": "2025-09-15 03:16:12.906481", "step": 455, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:12.937048", "step": 455, "epoch": 1 }, { "type": "loss", "content": 0.014448435045778751, "timestamp": "2025-09-15 03:16:12.964490", "step": 456, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-15 03:16:12.995091", "step": 456, "epoch": 1 }, { "type": "loss", "content": 0.003266042796894908, "timestamp": "2025-09-15 03:16:13.002712", "step": 457, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:13.032840", "step": 457, "epoch": 1 }, { "type": "loss", "content": 0.010741782374680042, "timestamp": "2025-09-15 03:16:13.039677", "step": 458, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:13.070181", "step": 458, "epoch": 1 }, { "type": "loss", "content": 0.023588094860315323, "timestamp": "2025-09-15 03:16:13.072505", "step": 459, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:13.102950", "step": 459, "epoch": 1 }, { "type": "loss", "content": 0.013890265487134457, "timestamp": "2025-09-15 03:16:13.126602", "step": 460, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:13.157332", "step": 460, "epoch": 1 }, { "type": "loss", "content": 0.03365600109100342, "timestamp": "2025-09-15 03:16:13.159365", "step": 461, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:13.189741", "step": 461, "epoch": 1 }, { "type": "loss", "content": 0.011849649250507355, "timestamp": "2025-09-15 03:16:13.191833", "step": 462, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:13.222366", "step": 462, "epoch": 1 }, { "type": "loss", "content": 0.013702219352126122, "timestamp": "2025-09-15 03:16:13.224423", "step": 463, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:13.254781", "step": 463, "epoch": 1 }, { "type": "loss", "content": 0.004361593630164862, "timestamp": "2025-09-15 03:16:13.278669", "step": 464, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:13.311353", "step": 464, "epoch": 1 }, { "type": "loss", "content": 0.010045178234577179, "timestamp": "2025-09-15 03:16:13.313416", "step": 465, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:13.343864", "step": 465, "epoch": 1 }, { "type": "loss", "content": 0.0036023962311446667, "timestamp": "2025-09-15 03:16:13.345903", "step": 466, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:13.376424", "step": 466, "epoch": 1 }, { "type": "loss", "content": 0.0018648395780473948, "timestamp": "2025-09-15 03:16:13.378860", "step": 467, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:13.409600", "step": 467, "epoch": 1 }, { "type": "loss", "content": 0.01642528362572193, "timestamp": "2025-09-15 03:16:13.433408", "step": 468, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:16:14.018984", "step": 468, "epoch": 1 }, { "type": "pplx", "content": 89994610.81027079, "timestamp": "2025-09-15 03:16:14.020991", "step": 468, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:14.049924", "step": 468, "epoch": 1 }, { "type": "loss", "content": 0.02225838229060173, "timestamp": "2025-09-15 03:16:14.051901", "step": 469, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:14.082434", "step": 469, "epoch": 1 }, { "type": "loss", "content": 0.012944989837706089, "timestamp": "2025-09-15 03:16:14.084750", "step": 470, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:14.116114", "step": 470, "epoch": 1 }, { "type": "loss", "content": 0.0015818104147911072, "timestamp": "2025-09-15 03:16:14.118321", "step": 471, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:14.148756", "step": 471, "epoch": 1 }, { "type": "loss", "content": 0.01057495642453432, "timestamp": "2025-09-15 03:16:14.173721", "step": 472, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:14.204761", "step": 472, "epoch": 1 }, { "type": "loss", "content": 0.03044239804148674, "timestamp": "2025-09-15 03:16:14.209014", "step": 473, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:14.239329", "step": 473, "epoch": 1 }, { "type": "loss", "content": 0.02977249026298523, "timestamp": "2025-09-15 03:16:14.241306", "step": 474, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:14.271617", "step": 474, "epoch": 1 }, { "type": "loss", "content": 0.0017273669363930821, "timestamp": "2025-09-15 03:16:14.275784", "step": 475, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:14.307221", "step": 475, "epoch": 1 }, { "type": "loss", "content": 0.025188205763697624, "timestamp": "2025-09-15 03:16:14.334889", "step": 476, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:14.365106", "step": 476, "epoch": 1 }, { "type": "loss", "content": 0.03656993433833122, "timestamp": "2025-09-15 03:16:14.367088", "step": 477, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:14.397260", "step": 477, "epoch": 1 }, { "type": "loss", "content": 0.03108203411102295, "timestamp": "2025-09-15 03:16:14.399261", "step": 478, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:14.430904", "step": 478, "epoch": 1 }, { "type": "loss", "content": 0.03353414312005043, "timestamp": "2025-09-15 03:16:14.434707", "step": 479, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:14.464683", "step": 479, "epoch": 1 }, { "type": "loss", "content": 0.003992644604295492, "timestamp": "2025-09-15 03:16:14.488349", "step": 480, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:14.519279", "step": 480, "epoch": 1 }, { "type": "loss", "content": 0.006703828927129507, "timestamp": "2025-09-15 03:16:14.521283", "step": 481, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:14.552148", "step": 481, "epoch": 1 }, { "type": "loss", "content": 0.0031639602966606617, "timestamp": "2025-09-15 03:16:14.555726", "step": 482, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:14.586911", "step": 482, "epoch": 1 }, { "type": "loss", "content": 0.03814245015382767, "timestamp": "2025-09-15 03:16:14.590632", "step": 483, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:14.621214", "step": 483, "epoch": 1 }, { "type": "loss", "content": 0.017492132261395454, "timestamp": "2025-09-15 03:16:14.644981", "step": 484, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-15 03:16:14.676050", "step": 484, "epoch": 1 }, { "type": "loss", "content": 0.022614041343331337, "timestamp": "2025-09-15 03:16:14.683270", "step": 485, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:14.713773", "step": 485, "epoch": 1 }, { "type": "loss", "content": 0.011417665518820286, "timestamp": "2025-09-15 03:16:14.715931", "step": 486, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:14.747103", "step": 486, "epoch": 1 }, { "type": "loss", "content": 0.010924013331532478, "timestamp": "2025-09-15 03:16:14.749320", "step": 487, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:14.780206", "step": 487, "epoch": 1 }, { "type": "loss", "content": 0.013234901241958141, "timestamp": "2025-09-15 03:16:14.803725", "step": 488, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:14.834583", "step": 488, "epoch": 1 }, { "type": "loss", "content": 0.006062183063477278, "timestamp": "2025-09-15 03:16:14.836613", "step": 489, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:14.866954", "step": 489, "epoch": 1 }, { "type": "loss", "content": 0.009334629401564598, "timestamp": "2025-09-15 03:16:14.870820", "step": 490, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:14.901669", "step": 490, "epoch": 1 }, { "type": "loss", "content": 0.015576253645122051, "timestamp": "2025-09-15 03:16:14.903654", "step": 491, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:14.935197", "step": 491, "epoch": 1 }, { "type": "loss", "content": 0.006126120686531067, "timestamp": "2025-09-15 03:16:14.958725", "step": 492, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:16:14.990060", "step": 492, "epoch": 1 }, { "type": "loss", "content": 0.0050699966959655285, "timestamp": "2025-09-15 03:16:14.995151", "step": 493, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:15.026200", "step": 493, "epoch": 1 }, { "type": "loss", "content": 0.015693241730332375, "timestamp": "2025-09-15 03:16:15.029936", "step": 494, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:15.060515", "step": 494, "epoch": 1 }, { "type": "loss", "content": 0.009532543830573559, "timestamp": "2025-09-15 03:16:15.062630", "step": 495, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:15.092879", "step": 495, "epoch": 1 }, { "type": "loss", "content": 0.02666117250919342, "timestamp": "2025-09-15 03:16:15.118090", "step": 496, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:15.148832", "step": 496, "epoch": 1 }, { "type": "loss", "content": 0.03242836147546768, "timestamp": "2025-09-15 03:16:15.150824", "step": 497, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:16:15.181371", "step": 497, "epoch": 1 }, { "type": "loss", "content": 0.020675910636782646, "timestamp": "2025-09-15 03:16:15.188732", "step": 498, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:15.219467", "step": 498, "epoch": 1 }, { "type": "loss", "content": 0.018618302419781685, "timestamp": "2025-09-15 03:16:15.221549", "step": 499, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:15.252547", "step": 499, "epoch": 1 }, { "type": "loss", "content": 0.021370384842157364, "timestamp": "2025-09-15 03:16:15.279935", "step": 500, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 500", "timestamp": "2025-09-15 03:16:21.950577", "step": 500, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:21.988100", "step": 500, "epoch": 1 }, { "type": "loss", "content": 0.0373503677546978, "timestamp": "2025-09-15 03:16:21.990113", "step": 501, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:22.024094", "step": 501, "epoch": 1 }, { "type": "loss", "content": 0.0026365232188254595, "timestamp": "2025-09-15 03:16:22.026097", "step": 502, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 304 ], "flops": 9017733530176 }, "timestamp": "2025-09-15 03:16:22.057196", "step": 502, "epoch": 1 }, { "type": "loss", "content": 0.01669916883111, "timestamp": "2025-09-15 03:16:22.068808", "step": 503, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:22.099620", "step": 503, "epoch": 1 }, { "type": "loss", "content": 0.023004433140158653, "timestamp": "2025-09-15 03:16:22.124198", "step": 504, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:22.154621", "step": 504, "epoch": 1 }, { "type": "loss", "content": 0.03475857526063919, "timestamp": "2025-09-15 03:16:22.156604", "step": 505, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:22.187632", "step": 505, "epoch": 1 }, { "type": "loss", "content": 0.027279671281576157, "timestamp": "2025-09-15 03:16:22.191592", "step": 506, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:22.224447", "step": 506, "epoch": 1 }, { "type": "loss", "content": 0.010024458169937134, "timestamp": "2025-09-15 03:16:22.231492", "step": 507, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:16:22.823830", "step": 507, "epoch": 1 }, { "type": "pplx", "content": 91882933.03724283, "timestamp": "2025-09-15 03:16:22.826008", "step": 507, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:22.855877", "step": 507, "epoch": 1 }, { "type": "loss", "content": 0.03937874361872673, "timestamp": "2025-09-15 03:16:22.879687", "step": 508, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:22.910646", "step": 508, "epoch": 1 }, { "type": "loss", "content": 0.009307787753641605, "timestamp": "2025-09-15 03:16:22.912493", "step": 509, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:22.943593", "step": 509, "epoch": 1 }, { "type": "loss", "content": 0.007812165655195713, "timestamp": "2025-09-15 03:16:22.949903", "step": 510, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:22.980832", "step": 510, "epoch": 1 }, { "type": "loss", "content": 0.014223689213395119, "timestamp": "2025-09-15 03:16:22.982851", "step": 511, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:23.013914", "step": 511, "epoch": 1 }, { "type": "loss", "content": 0.034484755247831345, "timestamp": "2025-09-15 03:16:23.038861", "step": 512, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:16:23.070240", "step": 512, "epoch": 1 }, { "type": "loss", "content": 0.051926709711551666, "timestamp": "2025-09-15 03:16:23.075215", "step": 513, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:23.105999", "step": 513, "epoch": 1 }, { "type": "loss", "content": 0.0070337154902517796, "timestamp": "2025-09-15 03:16:23.109788", "step": 514, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:23.141717", "step": 514, "epoch": 1 }, { "type": "loss", "content": 0.0057702637277543545, "timestamp": "2025-09-15 03:16:23.148368", "step": 515, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 80 ], "flops": 2373281365952 }, "timestamp": "2025-09-15 03:16:23.193173", "step": 515, "epoch": 1 }, { "type": "loss", "content": 0.010492673143744469, "timestamp": "2025-09-15 03:16:23.217013", "step": 516, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:23.248013", "step": 516, "epoch": 1 }, { "type": "loss", "content": 0.020993739366531372, "timestamp": "2025-09-15 03:16:23.249895", "step": 517, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:23.281199", "step": 517, "epoch": 1 }, { "type": "loss", "content": 0.005516039673238993, "timestamp": "2025-09-15 03:16:23.287803", "step": 518, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:23.318976", "step": 518, "epoch": 1 }, { "type": "loss", "content": 0.014884904026985168, "timestamp": "2025-09-15 03:16:23.322711", "step": 519, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:23.353328", "step": 519, "epoch": 1 }, { "type": "loss", "content": 0.016969164833426476, "timestamp": "2025-09-15 03:16:23.376911", "step": 520, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:23.407828", "step": 520, "epoch": 1 }, { "type": "loss", "content": 0.017451364547014236, "timestamp": "2025-09-15 03:16:23.410920", "step": 521, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:23.442614", "step": 521, "epoch": 1 }, { "type": "loss", "content": 0.0072836605831980705, "timestamp": "2025-09-15 03:16:23.444596", "step": 522, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:23.475148", "step": 522, "epoch": 1 }, { "type": "loss", "content": 0.03784263879060745, "timestamp": "2025-09-15 03:16:23.479224", "step": 523, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:23.510259", "step": 523, "epoch": 1 }, { "type": "loss", "content": 0.009146297350525856, "timestamp": "2025-09-15 03:16:23.537615", "step": 524, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:23.568894", "step": 524, "epoch": 1 }, { "type": "loss", "content": 0.014295176602900028, "timestamp": "2025-09-15 03:16:23.570854", "step": 525, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:23.602323", "step": 525, "epoch": 1 }, { "type": "loss", "content": 0.013233968988060951, "timestamp": "2025-09-15 03:16:23.606343", "step": 526, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:23.637385", "step": 526, "epoch": 1 }, { "type": "loss", "content": 0.023095833137631416, "timestamp": "2025-09-15 03:16:23.643916", "step": 527, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:23.674712", "step": 527, "epoch": 1 }, { "type": "loss", "content": 0.009187877178192139, "timestamp": "2025-09-15 03:16:23.702281", "step": 528, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:23.733249", "step": 528, "epoch": 1 }, { "type": "loss", "content": 0.03069307468831539, "timestamp": "2025-09-15 03:16:23.735035", "step": 529, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:23.766431", "step": 529, "epoch": 1 }, { "type": "loss", "content": 0.026559680700302124, "timestamp": "2025-09-15 03:16:23.768463", "step": 530, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:23.800597", "step": 530, "epoch": 1 }, { "type": "loss", "content": 0.010563480667769909, "timestamp": "2025-09-15 03:16:23.803675", "step": 531, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:23.835335", "step": 531, "epoch": 1 }, { "type": "loss", "content": 0.014871816150844097, "timestamp": "2025-09-15 03:16:23.858918", "step": 532, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:23.890289", "step": 532, "epoch": 1 }, { "type": "loss", "content": 0.0237137358635664, "timestamp": "2025-09-15 03:16:23.892517", "step": 533, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:23.922951", "step": 533, "epoch": 1 }, { "type": "loss", "content": 0.00401802035048604, "timestamp": "2025-09-15 03:16:23.926668", "step": 534, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:23.958168", "step": 534, "epoch": 1 }, { "type": "loss", "content": 0.011716380715370178, "timestamp": "2025-09-15 03:16:23.960693", "step": 535, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:23.992109", "step": 535, "epoch": 1 }, { "type": "loss", "content": 0.0034951730631291866, "timestamp": "2025-09-15 03:16:24.016560", "step": 536, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:24.049158", "step": 536, "epoch": 1 }, { "type": "loss", "content": 0.013428243808448315, "timestamp": "2025-09-15 03:16:24.051200", "step": 537, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:24.081795", "step": 537, "epoch": 1 }, { "type": "loss", "content": 0.018913377076387405, "timestamp": "2025-09-15 03:16:24.085679", "step": 538, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:24.116421", "step": 538, "epoch": 1 }, { "type": "loss", "content": 0.008952634409070015, "timestamp": "2025-09-15 03:16:24.122736", "step": 539, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:24.153800", "step": 539, "epoch": 1 }, { "type": "loss", "content": 0.010391423478722572, "timestamp": "2025-09-15 03:16:24.178636", "step": 540, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:24.215531", "step": 540, "epoch": 1 }, { "type": "loss", "content": 0.033783040940761566, "timestamp": "2025-09-15 03:16:24.219507", "step": 541, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:24.256786", "step": 541, "epoch": 1 }, { "type": "loss", "content": 0.018627535551786423, "timestamp": "2025-09-15 03:16:24.259212", "step": 542, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:24.289674", "step": 542, "epoch": 1 }, { "type": "loss", "content": 0.022923845797777176, "timestamp": "2025-09-15 03:16:24.293451", "step": 543, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:24.323991", "step": 543, "epoch": 1 }, { "type": "loss", "content": 0.014477974735200405, "timestamp": "2025-09-15 03:16:24.347530", "step": 544, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:24.379793", "step": 544, "epoch": 1 }, { "type": "loss", "content": 0.010905615985393524, "timestamp": "2025-09-15 03:16:24.384797", "step": 545, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:24.424449", "step": 545, "epoch": 1 }, { "type": "loss", "content": 0.015100879594683647, "timestamp": "2025-09-15 03:16:24.431849", "step": 546, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:16:25.056448", "step": 546, "epoch": 1 }, { "type": "pplx", "content": 89409619.26869535, "timestamp": "2025-09-15 03:16:25.058542", "step": 546, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:25.094970", "step": 546, "epoch": 1 }, { "type": "loss", "content": 0.019579365849494934, "timestamp": "2025-09-15 03:16:25.100875", "step": 547, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:25.131972", "step": 547, "epoch": 1 }, { "type": "loss", "content": 0.014669536612927914, "timestamp": "2025-09-15 03:16:25.155773", "step": 548, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:25.186873", "step": 548, "epoch": 1 }, { "type": "loss", "content": 0.007602573838084936, "timestamp": "2025-09-15 03:16:25.189133", "step": 549, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:25.222238", "step": 549, "epoch": 1 }, { "type": "loss", "content": 0.01011333055794239, "timestamp": "2025-09-15 03:16:25.224225", "step": 550, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:25.254253", "step": 550, "epoch": 1 }, { "type": "loss", "content": 0.008165261708199978, "timestamp": "2025-09-15 03:16:25.256283", "step": 551, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:25.287103", "step": 551, "epoch": 1 }, { "type": "loss", "content": 0.011216294951736927, "timestamp": "2025-09-15 03:16:25.314768", "step": 552, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:25.346143", "step": 552, "epoch": 1 }, { "type": "loss", "content": 0.016402242705225945, "timestamp": "2025-09-15 03:16:25.348227", "step": 553, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:25.379545", "step": 553, "epoch": 1 }, { "type": "loss", "content": 0.012566662393510342, "timestamp": "2025-09-15 03:16:25.382634", "step": 554, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:25.413335", "step": 554, "epoch": 1 }, { "type": "loss", "content": 0.010999364778399467, "timestamp": "2025-09-15 03:16:25.415460", "step": 555, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:25.447215", "step": 555, "epoch": 1 }, { "type": "loss", "content": 0.01768830046057701, "timestamp": "2025-09-15 03:16:25.472171", "step": 556, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:25.509057", "step": 556, "epoch": 1 }, { "type": "loss", "content": 0.010185576975345612, "timestamp": "2025-09-15 03:16:25.511060", "step": 557, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:25.543175", "step": 557, "epoch": 1 }, { "type": "loss", "content": 0.008643836714327335, "timestamp": "2025-09-15 03:16:25.547164", "step": 558, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:25.592428", "step": 558, "epoch": 1 }, { "type": "loss", "content": 0.010841822251677513, "timestamp": "2025-09-15 03:16:25.594492", "step": 559, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:25.625665", "step": 559, "epoch": 1 }, { "type": "loss", "content": 0.011339940130710602, "timestamp": "2025-09-15 03:16:25.650238", "step": 560, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:25.681804", "step": 560, "epoch": 1 }, { "type": "loss", "content": 0.021963687613606453, "timestamp": "2025-09-15 03:16:25.685664", "step": 561, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:25.716629", "step": 561, "epoch": 1 }, { "type": "loss", "content": 0.007223943714052439, "timestamp": "2025-09-15 03:16:25.722988", "step": 562, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:25.755056", "step": 562, "epoch": 1 }, { "type": "loss", "content": 0.020964499562978745, "timestamp": "2025-09-15 03:16:25.757256", "step": 563, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:25.788135", "step": 563, "epoch": 1 }, { "type": "loss", "content": 0.011233599856495857, "timestamp": "2025-09-15 03:16:25.812979", "step": 564, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:25.845622", "step": 564, "epoch": 1 }, { "type": "loss", "content": 0.010387702845036983, "timestamp": "2025-09-15 03:16:25.849734", "step": 565, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:25.880895", "step": 565, "epoch": 1 }, { "type": "loss", "content": 0.0083076860755682, "timestamp": "2025-09-15 03:16:25.882878", "step": 566, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:25.913724", "step": 566, "epoch": 1 }, { "type": "loss", "content": 0.008273184299468994, "timestamp": "2025-09-15 03:16:25.915818", "step": 567, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:25.947138", "step": 567, "epoch": 1 }, { "type": "loss", "content": 0.029909178614616394, "timestamp": "2025-09-15 03:16:25.974634", "step": 568, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:26.005839", "step": 568, "epoch": 1 }, { "type": "loss", "content": 0.002194339642301202, "timestamp": "2025-09-15 03:16:26.007911", "step": 569, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:26.038846", "step": 569, "epoch": 1 }, { "type": "loss", "content": 0.01601494289934635, "timestamp": "2025-09-15 03:16:26.040877", "step": 570, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:26.074163", "step": 570, "epoch": 1 }, { "type": "loss", "content": 0.0036689683329313993, "timestamp": "2025-09-15 03:16:26.080349", "step": 571, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:26.111727", "step": 571, "epoch": 1 }, { "type": "loss", "content": 0.004763443022966385, "timestamp": "2025-09-15 03:16:26.136385", "step": 572, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:26.166662", "step": 572, "epoch": 1 }, { "type": "loss", "content": 0.017256123945116997, "timestamp": "2025-09-15 03:16:26.168681", "step": 573, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:26.200094", "step": 573, "epoch": 1 }, { "type": "loss", "content": 0.0017653962131589651, "timestamp": "2025-09-15 03:16:26.206329", "step": 574, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:26.237036", "step": 574, "epoch": 1 }, { "type": "loss", "content": 0.0193465743213892, "timestamp": "2025-09-15 03:16:26.244267", "step": 575, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:26.276743", "step": 575, "epoch": 1 }, { "type": "loss", "content": 0.004393164534121752, "timestamp": "2025-09-15 03:16:26.300332", "step": 576, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:26.330817", "step": 576, "epoch": 1 }, { "type": "loss", "content": 0.0037941287737339735, "timestamp": "2025-09-15 03:16:26.332938", "step": 577, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:26.363896", "step": 577, "epoch": 1 }, { "type": "loss", "content": 0.0058485642075538635, "timestamp": "2025-09-15 03:16:26.367504", "step": 578, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:26.398604", "step": 578, "epoch": 1 }, { "type": "loss", "content": 0.025454718619585037, "timestamp": "2025-09-15 03:16:26.405155", "step": 579, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:26.435732", "step": 579, "epoch": 1 }, { "type": "loss", "content": 0.007020084653049707, "timestamp": "2025-09-15 03:16:26.460490", "step": 580, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:26.490814", "step": 580, "epoch": 1 }, { "type": "loss", "content": 0.009553035721182823, "timestamp": "2025-09-15 03:16:26.492897", "step": 581, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:26.523780", "step": 581, "epoch": 1 }, { "type": "loss", "content": 0.008278349414467812, "timestamp": "2025-09-15 03:16:26.531010", "step": 582, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:26.562056", "step": 582, "epoch": 1 }, { "type": "loss", "content": 0.0093050766736269, "timestamp": "2025-09-15 03:16:26.565876", "step": 583, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:26.596585", "step": 583, "epoch": 1 }, { "type": "loss", "content": 0.010887989774346352, "timestamp": "2025-09-15 03:16:26.621409", "step": 584, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:26.652519", "step": 584, "epoch": 1 }, { "type": "loss", "content": 0.03535384312272072, "timestamp": "2025-09-15 03:16:26.654693", "step": 585, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:16:27.242879", "step": 585, "epoch": 1 }, { "type": "pplx", "content": 98861864.54475525, "timestamp": "2025-09-15 03:16:27.245026", "step": 585, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:27.274538", "step": 585, "epoch": 1 }, { "type": "loss", "content": 0.004063038155436516, "timestamp": "2025-09-15 03:16:27.276485", "step": 586, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:27.307699", "step": 586, "epoch": 1 }, { "type": "loss", "content": 0.010096131823956966, "timestamp": "2025-09-15 03:16:27.311677", "step": 587, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:27.342110", "step": 587, "epoch": 1 }, { "type": "loss", "content": 0.03615206480026245, "timestamp": "2025-09-15 03:16:27.365904", "step": 588, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:27.397509", "step": 588, "epoch": 1 }, { "type": "loss", "content": 0.011354520916938782, "timestamp": "2025-09-15 03:16:27.399931", "step": 589, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:27.431176", "step": 589, "epoch": 1 }, { "type": "loss", "content": 0.028571562841534615, "timestamp": "2025-09-15 03:16:27.433426", "step": 590, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:27.464686", "step": 590, "epoch": 1 }, { "type": "loss", "content": 0.02120061218738556, "timestamp": "2025-09-15 03:16:27.468582", "step": 591, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:27.499303", "step": 591, "epoch": 1 }, { "type": "loss", "content": 0.017664339393377304, "timestamp": "2025-09-15 03:16:27.526697", "step": 592, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:27.567770", "step": 592, "epoch": 1 }, { "type": "loss", "content": 0.024698514491319656, "timestamp": "2025-09-15 03:16:27.569993", "step": 593, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:27.600896", "step": 593, "epoch": 1 }, { "type": "loss", "content": 0.017938822507858276, "timestamp": "2025-09-15 03:16:27.603229", "step": 594, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:27.636026", "step": 594, "epoch": 1 }, { "type": "loss", "content": 0.021319078281521797, "timestamp": "2025-09-15 03:16:27.639547", "step": 595, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:27.670627", "step": 595, "epoch": 1 }, { "type": "loss", "content": 0.016344984993338585, "timestamp": "2025-09-15 03:16:27.694665", "step": 596, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:27.729956", "step": 596, "epoch": 1 }, { "type": "loss", "content": 0.03623601421713829, "timestamp": "2025-09-15 03:16:27.732064", "step": 597, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:27.762421", "step": 597, "epoch": 1 }, { "type": "loss", "content": 0.008179962635040283, "timestamp": "2025-09-15 03:16:27.765476", "step": 598, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:27.808792", "step": 598, "epoch": 1 }, { "type": "loss", "content": 0.04286444932222366, "timestamp": "2025-09-15 03:16:27.810768", "step": 599, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:16:27.841669", "step": 599, "epoch": 1 }, { "type": "loss", "content": 0.002624726388603449, "timestamp": "2025-09-15 03:16:27.869981", "step": 600, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:27.901387", "step": 600, "epoch": 1 }, { "type": "loss", "content": 0.020307507365942, "timestamp": "2025-09-15 03:16:27.903524", "step": 601, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:27.934720", "step": 601, "epoch": 1 }, { "type": "loss", "content": 0.011326144449412823, "timestamp": "2025-09-15 03:16:27.940844", "step": 602, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:27.971864", "step": 602, "epoch": 1 }, { "type": "loss", "content": 0.014740855433046818, "timestamp": "2025-09-15 03:16:27.974138", "step": 603, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:28.004842", "step": 603, "epoch": 1 }, { "type": "loss", "content": 0.005876082926988602, "timestamp": "2025-09-15 03:16:28.029876", "step": 604, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:28.072400", "step": 604, "epoch": 1 }, { "type": "loss", "content": 0.020726444199681282, "timestamp": "2025-09-15 03:16:28.075612", "step": 605, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:28.109917", "step": 605, "epoch": 1 }, { "type": "loss", "content": 0.0367356613278389, "timestamp": "2025-09-15 03:16:28.112013", "step": 606, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:28.143701", "step": 606, "epoch": 1 }, { "type": "loss", "content": 0.04382667690515518, "timestamp": "2025-09-15 03:16:28.145851", "step": 607, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:28.176470", "step": 607, "epoch": 1 }, { "type": "loss", "content": 0.01624279096722603, "timestamp": "2025-09-15 03:16:28.200157", "step": 608, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:28.235642", "step": 608, "epoch": 1 }, { "type": "loss", "content": 0.033406250178813934, "timestamp": "2025-09-15 03:16:28.241303", "step": 609, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:28.273855", "step": 609, "epoch": 1 }, { "type": "loss", "content": 0.008596173487603664, "timestamp": "2025-09-15 03:16:28.276174", "step": 610, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:28.306420", "step": 610, "epoch": 1 }, { "type": "loss", "content": 0.015879623591899872, "timestamp": "2025-09-15 03:16:28.311014", "step": 611, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:28.341507", "step": 611, "epoch": 1 }, { "type": "loss", "content": 0.00993258785456419, "timestamp": "2025-09-15 03:16:28.369275", "step": 612, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:28.401614", "step": 612, "epoch": 1 }, { "type": "loss", "content": 0.016361379995942116, "timestamp": "2025-09-15 03:16:28.405897", "step": 613, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:28.436227", "step": 613, "epoch": 1 }, { "type": "loss", "content": 0.015362887643277645, "timestamp": "2025-09-15 03:16:28.438569", "step": 614, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:28.468674", "step": 614, "epoch": 1 }, { "type": "loss", "content": 0.016249025240540504, "timestamp": "2025-09-15 03:16:28.475849", "step": 615, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:28.509620", "step": 615, "epoch": 1 }, { "type": "loss", "content": 0.04643365740776062, "timestamp": "2025-09-15 03:16:28.533483", "step": 616, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:28.568172", "step": 616, "epoch": 1 }, { "type": "loss", "content": 0.023818328976631165, "timestamp": "2025-09-15 03:16:28.572843", "step": 617, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:28.604918", "step": 617, "epoch": 1 }, { "type": "loss", "content": 0.02747488021850586, "timestamp": "2025-09-15 03:16:28.607219", "step": 618, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:28.639406", "step": 618, "epoch": 1 }, { "type": "loss", "content": 0.01822894625365734, "timestamp": "2025-09-15 03:16:28.644272", "step": 619, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:28.674899", "step": 619, "epoch": 1 }, { "type": "loss", "content": 0.011014273390173912, "timestamp": "2025-09-15 03:16:28.700046", "step": 620, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:28.730674", "step": 620, "epoch": 1 }, { "type": "loss", "content": 0.018967941403388977, "timestamp": "2025-09-15 03:16:28.732984", "step": 621, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:28.763784", "step": 621, "epoch": 1 }, { "type": "loss", "content": 0.003352448111400008, "timestamp": "2025-09-15 03:16:28.770669", "step": 622, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 2, 192 ], "flops": 2847885110400 }, "timestamp": "2025-09-15 03:16:28.807097", "step": 622, "epoch": 1 }, { "type": "loss", "content": 0.02111704833805561, "timestamp": "2025-09-15 03:16:28.809129", "step": 623, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:28.857756", "step": 623, "epoch": 2 }, { "type": "loss", "content": 0.015314899384975433, "timestamp": "2025-09-15 03:16:28.881610", "step": 624, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:16:29.781314", "step": 624, "epoch": 2 }, { "type": "pplx", "content": 82289978.05812527, "timestamp": "2025-09-15 03:16:29.783394", "step": 624, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:29.828336", "step": 624, "epoch": 2 }, { "type": "loss", "content": 0.02048143930733204, "timestamp": "2025-09-15 03:16:29.830776", "step": 625, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:29.894938", "step": 625, "epoch": 2 }, { "type": "loss", "content": 0.009616588242352009, "timestamp": "2025-09-15 03:16:29.899160", "step": 626, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:29.929697", "step": 626, "epoch": 2 }, { "type": "loss", "content": 0.008175384253263474, "timestamp": "2025-09-15 03:16:29.934138", "step": 627, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:29.966024", "step": 627, "epoch": 2 }, { "type": "loss", "content": 0.003119382541626692, "timestamp": "2025-09-15 03:16:29.994430", "step": 628, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:30.026722", "step": 628, "epoch": 2 }, { "type": "loss", "content": 0.013358536176383495, "timestamp": "2025-09-15 03:16:30.028860", "step": 629, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:30.062870", "step": 629, "epoch": 2 }, { "type": "loss", "content": 0.011693683452904224, "timestamp": "2025-09-15 03:16:30.064956", "step": 630, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:30.110879", "step": 630, "epoch": 2 }, { "type": "loss", "content": 0.03411411494016647, "timestamp": "2025-09-15 03:16:30.115525", "step": 631, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:30.148633", "step": 631, "epoch": 2 }, { "type": "loss", "content": 0.009591693058609962, "timestamp": "2025-09-15 03:16:30.172452", "step": 632, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:30.215888", "step": 632, "epoch": 2 }, { "type": "loss", "content": 0.012655379250645638, "timestamp": "2025-09-15 03:16:30.217968", "step": 633, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:30.258789", "step": 633, "epoch": 2 }, { "type": "loss", "content": 0.04086309298872948, "timestamp": "2025-09-15 03:16:30.266120", "step": 634, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:30.318110", "step": 634, "epoch": 2 }, { "type": "loss", "content": 0.009325115010142326, "timestamp": "2025-09-15 03:16:30.322378", "step": 635, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:30.389630", "step": 635, "epoch": 2 }, { "type": "loss", "content": 0.007273114286363125, "timestamp": "2025-09-15 03:16:30.413627", "step": 636, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:30.460387", "step": 636, "epoch": 2 }, { "type": "loss", "content": 0.025864509865641594, "timestamp": "2025-09-15 03:16:30.462396", "step": 637, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:30.508504", "step": 637, "epoch": 2 }, { "type": "loss", "content": 0.021363092586398125, "timestamp": "2025-09-15 03:16:30.513055", "step": 638, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:30.552361", "step": 638, "epoch": 2 }, { "type": "loss", "content": 0.02703591249883175, "timestamp": "2025-09-15 03:16:30.555609", "step": 639, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:30.597540", "step": 639, "epoch": 2 }, { "type": "loss", "content": 0.017300015315413475, "timestamp": "2025-09-15 03:16:30.622644", "step": 640, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:30.692036", "step": 640, "epoch": 2 }, { "type": "loss", "content": 0.009818075224757195, "timestamp": "2025-09-15 03:16:30.696861", "step": 641, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:30.738126", "step": 641, "epoch": 2 }, { "type": "loss", "content": 0.01826951466500759, "timestamp": "2025-09-15 03:16:30.740509", "step": 642, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:30.787648", "step": 642, "epoch": 2 }, { "type": "loss", "content": 0.01782812550663948, "timestamp": "2025-09-15 03:16:30.794983", "step": 643, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:30.843831", "step": 643, "epoch": 2 }, { "type": "loss", "content": 0.017956728115677834, "timestamp": "2025-09-15 03:16:30.872155", "step": 644, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:30.920502", "step": 644, "epoch": 2 }, { "type": "loss", "content": 0.01835721917450428, "timestamp": "2025-09-15 03:16:30.922427", "step": 645, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:30.974228", "step": 645, "epoch": 2 }, { "type": "loss", "content": 0.023359332233667374, "timestamp": "2025-09-15 03:16:30.976342", "step": 646, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:31.035373", "step": 646, "epoch": 2 }, { "type": "loss", "content": 0.03923198580741882, "timestamp": "2025-09-15 03:16:31.040605", "step": 647, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:31.083255", "step": 647, "epoch": 2 }, { "type": "loss", "content": 0.005954307969659567, "timestamp": "2025-09-15 03:16:31.111450", "step": 648, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:16:31.151549", "step": 648, "epoch": 2 }, { "type": "loss", "content": 0.006974720861762762, "timestamp": "2025-09-15 03:16:31.157511", "step": 649, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:31.190714", "step": 649, "epoch": 2 }, { "type": "loss", "content": 0.03683912381529808, "timestamp": "2025-09-15 03:16:31.195443", "step": 650, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:31.226021", "step": 650, "epoch": 2 }, { "type": "loss", "content": 0.027897637337446213, "timestamp": "2025-09-15 03:16:31.228874", "step": 651, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:31.259785", "step": 651, "epoch": 2 }, { "type": "loss", "content": 0.012562957592308521, "timestamp": "2025-09-15 03:16:31.288342", "step": 652, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:31.318162", "step": 652, "epoch": 2 }, { "type": "loss", "content": 0.007003194652497768, "timestamp": "2025-09-15 03:16:31.320261", "step": 653, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:31.352551", "step": 653, "epoch": 2 }, { "type": "loss", "content": 0.00510317413136363, "timestamp": "2025-09-15 03:16:31.357365", "step": 654, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:31.387314", "step": 654, "epoch": 2 }, { "type": "loss", "content": 0.020198477432131767, "timestamp": "2025-09-15 03:16:31.389518", "step": 655, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:31.422067", "step": 655, "epoch": 2 }, { "type": "loss", "content": 0.016794826835393906, "timestamp": "2025-09-15 03:16:31.450534", "step": 656, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:31.480399", "step": 656, "epoch": 2 }, { "type": "loss", "content": 0.02333107963204384, "timestamp": "2025-09-15 03:16:31.485657", "step": 657, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:31.516024", "step": 657, "epoch": 2 }, { "type": "loss", "content": 0.013125887140631676, "timestamp": "2025-09-15 03:16:31.521153", "step": 658, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:31.551427", "step": 658, "epoch": 2 }, { "type": "loss", "content": 0.011822559870779514, "timestamp": "2025-09-15 03:16:31.554582", "step": 659, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:31.584092", "step": 659, "epoch": 2 }, { "type": "loss", "content": 0.018171224743127823, "timestamp": "2025-09-15 03:16:31.607707", "step": 660, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:31.637972", "step": 660, "epoch": 2 }, { "type": "loss", "content": 0.017307527363300323, "timestamp": "2025-09-15 03:16:31.640046", "step": 661, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:31.669186", "step": 661, "epoch": 2 }, { "type": "loss", "content": 0.01217662263661623, "timestamp": "2025-09-15 03:16:31.671433", "step": 662, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:31.701554", "step": 662, "epoch": 2 }, { "type": "loss", "content": 0.015428896062076092, "timestamp": "2025-09-15 03:16:31.707209", "step": 663, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:16:32.293684", "step": 663, "epoch": 2 }, { "type": "pplx", "content": 75053295.7847995, "timestamp": "2025-09-15 03:16:32.295602", "step": 663, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:32.324065", "step": 663, "epoch": 2 }, { "type": "loss", "content": 0.01401414256542921, "timestamp": "2025-09-15 03:16:32.352081", "step": 664, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:32.382447", "step": 664, "epoch": 2 }, { "type": "loss", "content": 0.025133652612566948, "timestamp": "2025-09-15 03:16:32.385035", "step": 665, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:32.414441", "step": 665, "epoch": 2 }, { "type": "loss", "content": 0.010838499292731285, "timestamp": "2025-09-15 03:16:32.417702", "step": 666, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:32.447082", "step": 666, "epoch": 2 }, { "type": "loss", "content": 0.04639563336968422, "timestamp": "2025-09-15 03:16:32.454729", "step": 667, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:32.484473", "step": 667, "epoch": 2 }, { "type": "loss", "content": 0.006913153920322657, "timestamp": "2025-09-15 03:16:32.507752", "step": 668, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:32.537939", "step": 668, "epoch": 2 }, { "type": "loss", "content": 0.01731266640126705, "timestamp": "2025-09-15 03:16:32.540688", "step": 669, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:32.570215", "step": 669, "epoch": 2 }, { "type": "loss", "content": 0.021937739104032516, "timestamp": "2025-09-15 03:16:32.571954", "step": 670, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:32.600976", "step": 670, "epoch": 2 }, { "type": "loss", "content": 0.027854878455400467, "timestamp": "2025-09-15 03:16:32.602817", "step": 671, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:32.631921", "step": 671, "epoch": 2 }, { "type": "loss", "content": 0.022487761452794075, "timestamp": "2025-09-15 03:16:32.655646", "step": 672, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:32.684848", "step": 672, "epoch": 2 }, { "type": "loss", "content": 0.01178536843508482, "timestamp": "2025-09-15 03:16:32.687401", "step": 673, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:32.717121", "step": 673, "epoch": 2 }, { "type": "loss", "content": 0.02284214086830616, "timestamp": "2025-09-15 03:16:32.718899", "step": 674, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:32.747878", "step": 674, "epoch": 2 }, { "type": "loss", "content": 0.004044529981911182, "timestamp": "2025-09-15 03:16:32.750692", "step": 675, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:32.779566", "step": 675, "epoch": 2 }, { "type": "loss", "content": 0.028430771082639694, "timestamp": "2025-09-15 03:16:32.802856", "step": 676, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:32.836214", "step": 676, "epoch": 2 }, { "type": "loss", "content": 0.009200649335980415, "timestamp": "2025-09-15 03:16:32.837894", "step": 677, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:32.871695", "step": 677, "epoch": 2 }, { "type": "loss", "content": 0.015377013944089413, "timestamp": "2025-09-15 03:16:32.876501", "step": 678, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:32.906114", "step": 678, "epoch": 2 }, { "type": "loss", "content": 0.02052604779601097, "timestamp": "2025-09-15 03:16:32.908188", "step": 679, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:32.937695", "step": 679, "epoch": 2 }, { "type": "loss", "content": 0.015673715621232986, "timestamp": "2025-09-15 03:16:32.961332", "step": 680, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:32.990713", "step": 680, "epoch": 2 }, { "type": "loss", "content": 0.02544725500047207, "timestamp": "2025-09-15 03:16:32.992609", "step": 681, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:33.022312", "step": 681, "epoch": 2 }, { "type": "loss", "content": 0.016639648005366325, "timestamp": "2025-09-15 03:16:33.025334", "step": 682, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:33.054557", "step": 682, "epoch": 2 }, { "type": "loss", "content": 0.020371485501527786, "timestamp": "2025-09-15 03:16:33.056748", "step": 683, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:33.087222", "step": 683, "epoch": 2 }, { "type": "loss", "content": 0.012767528183758259, "timestamp": "2025-09-15 03:16:33.111045", "step": 684, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:33.140413", "step": 684, "epoch": 2 }, { "type": "loss", "content": 0.01824815571308136, "timestamp": "2025-09-15 03:16:33.142737", "step": 685, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:33.172116", "step": 685, "epoch": 2 }, { "type": "loss", "content": 0.009122505784034729, "timestamp": "2025-09-15 03:16:33.175010", "step": 686, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:33.204797", "step": 686, "epoch": 2 }, { "type": "loss", "content": 0.01623712293803692, "timestamp": "2025-09-15 03:16:33.209842", "step": 687, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:33.239531", "step": 687, "epoch": 2 }, { "type": "loss", "content": 0.01666615717113018, "timestamp": "2025-09-15 03:16:33.265187", "step": 688, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:33.293849", "step": 688, "epoch": 2 }, { "type": "loss", "content": 0.0204975213855505, "timestamp": "2025-09-15 03:16:33.295799", "step": 689, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:33.325298", "step": 689, "epoch": 2 }, { "type": "loss", "content": 0.011426101438701153, "timestamp": "2025-09-15 03:16:33.335610", "step": 690, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:33.366510", "step": 690, "epoch": 2 }, { "type": "loss", "content": 0.01042362954467535, "timestamp": "2025-09-15 03:16:33.368838", "step": 691, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:33.398837", "step": 691, "epoch": 2 }, { "type": "loss", "content": 0.028445009142160416, "timestamp": "2025-09-15 03:16:33.424469", "step": 692, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:33.454157", "step": 692, "epoch": 2 }, { "type": "loss", "content": 0.011215626262128353, "timestamp": "2025-09-15 03:16:33.456148", "step": 693, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:33.485775", "step": 693, "epoch": 2 }, { "type": "loss", "content": 0.012956084683537483, "timestamp": "2025-09-15 03:16:33.487973", "step": 694, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:33.517389", "step": 694, "epoch": 2 }, { "type": "loss", "content": 0.012555070221424103, "timestamp": "2025-09-15 03:16:33.519670", "step": 695, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:33.549477", "step": 695, "epoch": 2 }, { "type": "loss", "content": 0.012987296096980572, "timestamp": "2025-09-15 03:16:33.572674", "step": 696, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:33.601684", "step": 696, "epoch": 2 }, { "type": "loss", "content": 0.009055888280272484, "timestamp": "2025-09-15 03:16:33.603746", "step": 697, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:33.633132", "step": 697, "epoch": 2 }, { "type": "loss", "content": 0.003815798321738839, "timestamp": "2025-09-15 03:16:33.634950", "step": 698, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:33.664742", "step": 698, "epoch": 2 }, { "type": "loss", "content": 0.004667258355766535, "timestamp": "2025-09-15 03:16:33.669761", "step": 699, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:33.698956", "step": 699, "epoch": 2 }, { "type": "loss", "content": 0.017840759828686714, "timestamp": "2025-09-15 03:16:33.723232", "step": 700, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:33.752626", "step": 700, "epoch": 2 }, { "type": "loss", "content": 0.015343495644629002, "timestamp": "2025-09-15 03:16:33.757678", "step": 701, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:33.788145", "step": 701, "epoch": 2 }, { "type": "loss", "content": 0.020067986100912094, "timestamp": "2025-09-15 03:16:33.796127", "step": 702, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:16:34.379492", "step": 702, "epoch": 2 }, { "type": "pplx", "content": 77832797.15443674, "timestamp": "2025-09-15 03:16:34.381487", "step": 702, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:34.409799", "step": 702, "epoch": 2 }, { "type": "loss", "content": 0.006786287762224674, "timestamp": "2025-09-15 03:16:34.414257", "step": 703, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:34.443684", "step": 703, "epoch": 2 }, { "type": "loss", "content": 0.007841740734875202, "timestamp": "2025-09-15 03:16:34.471957", "step": 704, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:34.501439", "step": 704, "epoch": 2 }, { "type": "loss", "content": 0.006768788676708937, "timestamp": "2025-09-15 03:16:34.504126", "step": 705, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:34.534114", "step": 705, "epoch": 2 }, { "type": "loss", "content": 0.013998471200466156, "timestamp": "2025-09-15 03:16:34.542119", "step": 706, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:34.572814", "step": 706, "epoch": 2 }, { "type": "loss", "content": 0.016503024846315384, "timestamp": "2025-09-15 03:16:34.577344", "step": 707, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:34.607323", "step": 707, "epoch": 2 }, { "type": "loss", "content": 0.0034505471121519804, "timestamp": "2025-09-15 03:16:34.635888", "step": 708, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:34.664818", "step": 708, "epoch": 2 }, { "type": "loss", "content": 0.005293605383485556, "timestamp": "2025-09-15 03:16:34.666822", "step": 709, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:34.696327", "step": 709, "epoch": 2 }, { "type": "loss", "content": 0.01751369796693325, "timestamp": "2025-09-15 03:16:34.704308", "step": 710, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:34.733767", "step": 710, "epoch": 2 }, { "type": "loss", "content": 0.0058233654126524925, "timestamp": "2025-09-15 03:16:34.735421", "step": 711, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:34.765340", "step": 711, "epoch": 2 }, { "type": "loss", "content": 0.01761193387210369, "timestamp": "2025-09-15 03:16:34.793544", "step": 712, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:34.822857", "step": 712, "epoch": 2 }, { "type": "loss", "content": 0.008471434004604816, "timestamp": "2025-09-15 03:16:34.826186", "step": 713, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:34.856873", "step": 713, "epoch": 2 }, { "type": "loss", "content": 0.010182567872107029, "timestamp": "2025-09-15 03:16:34.860710", "step": 714, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:34.893589", "step": 714, "epoch": 2 }, { "type": "loss", "content": 0.010621651075780392, "timestamp": "2025-09-15 03:16:34.898591", "step": 715, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:34.928577", "step": 715, "epoch": 2 }, { "type": "loss", "content": 0.026854127645492554, "timestamp": "2025-09-15 03:16:34.954121", "step": 716, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:34.984982", "step": 716, "epoch": 2 }, { "type": "loss", "content": 0.014122831635177135, "timestamp": "2025-09-15 03:16:34.987777", "step": 717, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:35.017197", "step": 717, "epoch": 2 }, { "type": "loss", "content": 0.006635740399360657, "timestamp": "2025-09-15 03:16:35.022246", "step": 718, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:35.052408", "step": 718, "epoch": 2 }, { "type": "loss", "content": 0.025449639186263084, "timestamp": "2025-09-15 03:16:35.057072", "step": 719, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:35.087518", "step": 719, "epoch": 2 }, { "type": "loss", "content": 0.012080073356628418, "timestamp": "2025-09-15 03:16:35.110634", "step": 720, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:35.138838", "step": 720, "epoch": 2 }, { "type": "loss", "content": 0.025123355910182, "timestamp": "2025-09-15 03:16:35.140986", "step": 721, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:35.170557", "step": 721, "epoch": 2 }, { "type": "loss", "content": 0.0044836062006652355, "timestamp": "2025-09-15 03:16:35.178670", "step": 722, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:35.208096", "step": 722, "epoch": 2 }, { "type": "loss", "content": 0.0019570142030715942, "timestamp": "2025-09-15 03:16:35.211255", "step": 723, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:35.240586", "step": 723, "epoch": 2 }, { "type": "loss", "content": 0.005114336032420397, "timestamp": "2025-09-15 03:16:35.268864", "step": 724, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:35.298475", "step": 724, "epoch": 2 }, { "type": "loss", "content": 0.0038301029708236456, "timestamp": "2025-09-15 03:16:35.301247", "step": 725, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:35.330890", "step": 725, "epoch": 2 }, { "type": "loss", "content": 0.01759827882051468, "timestamp": "2025-09-15 03:16:35.334012", "step": 726, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:35.363990", "step": 726, "epoch": 2 }, { "type": "loss", "content": 0.012181101366877556, "timestamp": "2025-09-15 03:16:35.369008", "step": 727, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:16:35.400083", "step": 727, "epoch": 2 }, { "type": "loss", "content": 0.004602841567248106, "timestamp": "2025-09-15 03:16:35.429385", "step": 728, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:35.458710", "step": 728, "epoch": 2 }, { "type": "loss", "content": 0.00435149110853672, "timestamp": "2025-09-15 03:16:35.460902", "step": 729, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:35.492071", "step": 729, "epoch": 2 }, { "type": "loss", "content": 0.006036927457898855, "timestamp": "2025-09-15 03:16:35.496762", "step": 730, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:35.525715", "step": 730, "epoch": 2 }, { "type": "loss", "content": 0.005526290275156498, "timestamp": "2025-09-15 03:16:35.527628", "step": 731, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:35.556538", "step": 731, "epoch": 2 }, { "type": "loss", "content": 0.018071789294481277, "timestamp": "2025-09-15 03:16:35.579581", "step": 732, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:35.608987", "step": 732, "epoch": 2 }, { "type": "loss", "content": 0.012418746016919613, "timestamp": "2025-09-15 03:16:35.611290", "step": 733, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:35.640967", "step": 733, "epoch": 2 }, { "type": "loss", "content": 0.017110273241996765, "timestamp": "2025-09-15 03:16:35.646041", "step": 734, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:35.676386", "step": 734, "epoch": 2 }, { "type": "loss", "content": 0.0071751694194972515, "timestamp": "2025-09-15 03:16:35.678099", "step": 735, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:35.707546", "step": 735, "epoch": 2 }, { "type": "loss", "content": 0.008484402671456337, "timestamp": "2025-09-15 03:16:35.733162", "step": 736, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:35.763879", "step": 736, "epoch": 2 }, { "type": "loss", "content": 0.018988804891705513, "timestamp": "2025-09-15 03:16:35.769527", "step": 737, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:35.799353", "step": 737, "epoch": 2 }, { "type": "loss", "content": 0.013006098568439484, "timestamp": "2025-09-15 03:16:35.801249", "step": 738, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:35.831004", "step": 738, "epoch": 2 }, { "type": "loss", "content": 0.005592212080955505, "timestamp": "2025-09-15 03:16:35.836027", "step": 739, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:35.865638", "step": 739, "epoch": 2 }, { "type": "loss", "content": 0.0024918068666011095, "timestamp": "2025-09-15 03:16:35.890307", "step": 740, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:35.920453", "step": 740, "epoch": 2 }, { "type": "loss", "content": 0.01893484592437744, "timestamp": "2025-09-15 03:16:35.925465", "step": 741, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:16:36.509456", "step": 741, "epoch": 2 }, { "type": "pplx", "content": 91392949.23677413, "timestamp": "2025-09-15 03:16:36.511155", "step": 741, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:36.539681", "step": 741, "epoch": 2 }, { "type": "loss", "content": 0.006453401874750853, "timestamp": "2025-09-15 03:16:36.543957", "step": 742, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 288 ], "flops": 8543129804160 }, "timestamp": "2025-09-15 03:16:36.573874", "step": 742, "epoch": 2 }, { "type": "loss", "content": 0.011112798936665058, "timestamp": "2025-09-15 03:16:36.584820", "step": 743, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:36.615801", "step": 743, "epoch": 2 }, { "type": "loss", "content": 0.004536872263997793, "timestamp": "2025-09-15 03:16:36.641339", "step": 744, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:36.671119", "step": 744, "epoch": 2 }, { "type": "loss", "content": 0.0465034581720829, "timestamp": "2025-09-15 03:16:36.673193", "step": 745, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:36.702738", "step": 745, "epoch": 2 }, { "type": "loss", "content": 0.03564067929983139, "timestamp": "2025-09-15 03:16:36.707647", "step": 746, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:36.737894", "step": 746, "epoch": 2 }, { "type": "loss", "content": 0.01765831746160984, "timestamp": "2025-09-15 03:16:36.744866", "step": 747, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:36.774711", "step": 747, "epoch": 2 }, { "type": "loss", "content": 0.017794979736208916, "timestamp": "2025-09-15 03:16:36.798376", "step": 748, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:36.828621", "step": 748, "epoch": 2 }, { "type": "loss", "content": 0.002160144504159689, "timestamp": "2025-09-15 03:16:36.830790", "step": 749, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:36.861174", "step": 749, "epoch": 2 }, { "type": "loss", "content": 0.0007071318104863167, "timestamp": "2025-09-15 03:16:36.863311", "step": 750, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:36.893618", "step": 750, "epoch": 2 }, { "type": "loss", "content": 0.0015173929277807474, "timestamp": "2025-09-15 03:16:36.895853", "step": 751, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:36.926094", "step": 751, "epoch": 2 }, { "type": "loss", "content": 0.005006411578506231, "timestamp": "2025-09-15 03:16:36.951636", "step": 752, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-15 03:16:36.981783", "step": 752, "epoch": 2 }, { "type": "loss", "content": 0.008088169619441032, "timestamp": "2025-09-15 03:16:36.989623", "step": 753, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:37.020847", "step": 753, "epoch": 2 }, { "type": "loss", "content": 0.0008037859806790948, "timestamp": "2025-09-15 03:16:37.024945", "step": 754, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:37.054851", "step": 754, "epoch": 2 }, { "type": "loss", "content": 0.001742966240271926, "timestamp": "2025-09-15 03:16:37.056939", "step": 755, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:37.087156", "step": 755, "epoch": 2 }, { "type": "loss", "content": 0.00438849488273263, "timestamp": "2025-09-15 03:16:37.112369", "step": 756, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:37.142793", "step": 756, "epoch": 2 }, { "type": "loss", "content": 0.008937621489167213, "timestamp": "2025-09-15 03:16:37.147315", "step": 757, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:37.177595", "step": 757, "epoch": 2 }, { "type": "loss", "content": 0.05840831249952316, "timestamp": "2025-09-15 03:16:37.185106", "step": 758, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:37.215954", "step": 758, "epoch": 2 }, { "type": "loss", "content": 0.0005601375596597791, "timestamp": "2025-09-15 03:16:37.220617", "step": 759, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:37.250156", "step": 759, "epoch": 2 }, { "type": "loss", "content": 0.011043830774724483, "timestamp": "2025-09-15 03:16:37.273773", "step": 760, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:37.303774", "step": 760, "epoch": 2 }, { "type": "loss", "content": 0.0023872568272054195, "timestamp": "2025-09-15 03:16:37.305986", "step": 761, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:37.336645", "step": 761, "epoch": 2 }, { "type": "loss", "content": 0.06608807295560837, "timestamp": "2025-09-15 03:16:37.341189", "step": 762, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:37.370848", "step": 762, "epoch": 2 }, { "type": "loss", "content": 0.0007567157153971493, "timestamp": "2025-09-15 03:16:37.373388", "step": 763, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:37.403097", "step": 763, "epoch": 2 }, { "type": "loss", "content": 0.0014279234455898404, "timestamp": "2025-09-15 03:16:37.426664", "step": 764, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:16:37.458231", "step": 764, "epoch": 2 }, { "type": "loss", "content": 0.01821976900100708, "timestamp": "2025-09-15 03:16:37.463548", "step": 765, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:37.493815", "step": 765, "epoch": 2 }, { "type": "loss", "content": 0.0018954897532239556, "timestamp": "2025-09-15 03:16:37.495984", "step": 766, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:37.527727", "step": 766, "epoch": 2 }, { "type": "loss", "content": 0.0035364211071282625, "timestamp": "2025-09-15 03:16:37.529820", "step": 767, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:37.559701", "step": 767, "epoch": 2 }, { "type": "loss", "content": 0.028634265065193176, "timestamp": "2025-09-15 03:16:37.583279", "step": 768, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:37.612862", "step": 768, "epoch": 2 }, { "type": "loss", "content": 0.011222733184695244, "timestamp": "2025-09-15 03:16:37.614870", "step": 769, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:37.645347", "step": 769, "epoch": 2 }, { "type": "loss", "content": 0.004136077128350735, "timestamp": "2025-09-15 03:16:37.647414", "step": 770, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:37.677605", "step": 770, "epoch": 2 }, { "type": "loss", "content": 0.005994807463139296, "timestamp": "2025-09-15 03:16:37.681907", "step": 771, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:37.712243", "step": 771, "epoch": 2 }, { "type": "loss", "content": 0.014044421724975109, "timestamp": "2025-09-15 03:16:37.737562", "step": 772, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:37.767627", "step": 772, "epoch": 2 }, { "type": "loss", "content": 0.044583700597286224, "timestamp": "2025-09-15 03:16:37.769968", "step": 773, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:37.799796", "step": 773, "epoch": 2 }, { "type": "loss", "content": 0.0017988347681239247, "timestamp": "2025-09-15 03:16:37.801741", "step": 774, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:37.833531", "step": 774, "epoch": 2 }, { "type": "loss", "content": 0.02564374729990959, "timestamp": "2025-09-15 03:16:37.835743", "step": 775, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:37.865814", "step": 775, "epoch": 2 }, { "type": "loss", "content": 0.01314005721360445, "timestamp": "2025-09-15 03:16:37.889695", "step": 776, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:37.920049", "step": 776, "epoch": 2 }, { "type": "loss", "content": 0.0391593798995018, "timestamp": "2025-09-15 03:16:37.922170", "step": 777, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:37.951649", "step": 777, "epoch": 2 }, { "type": "loss", "content": 0.008872945792973042, "timestamp": "2025-09-15 03:16:37.956011", "step": 778, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:37.986077", "step": 778, "epoch": 2 }, { "type": "loss", "content": 0.009254111908376217, "timestamp": "2025-09-15 03:16:37.993209", "step": 779, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:38.024701", "step": 779, "epoch": 2 }, { "type": "loss", "content": 0.001388642704114318, "timestamp": "2025-09-15 03:16:38.048402", "step": 780, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:16:38.637769", "step": 780, "epoch": 2 }, { "type": "pplx", "content": 91475636.10912232, "timestamp": "2025-09-15 03:16:38.639760", "step": 780, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:38.668757", "step": 780, "epoch": 2 }, { "type": "loss", "content": 0.05400308594107628, "timestamp": "2025-09-15 03:16:38.670970", "step": 781, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:38.700864", "step": 781, "epoch": 2 }, { "type": "loss", "content": 0.029076367616653442, "timestamp": "2025-09-15 03:16:38.704716", "step": 782, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:38.735483", "step": 782, "epoch": 2 }, { "type": "loss", "content": 0.015219086781144142, "timestamp": "2025-09-15 03:16:38.742725", "step": 783, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:38.772989", "step": 783, "epoch": 2 }, { "type": "loss", "content": 0.01549097616225481, "timestamp": "2025-09-15 03:16:38.798231", "step": 784, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:38.828882", "step": 784, "epoch": 2 }, { "type": "loss", "content": 0.006616808939725161, "timestamp": "2025-09-15 03:16:38.830975", "step": 785, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:38.861612", "step": 785, "epoch": 2 }, { "type": "loss", "content": 0.02561446838080883, "timestamp": "2025-09-15 03:16:38.869180", "step": 786, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:38.899001", "step": 786, "epoch": 2 }, { "type": "loss", "content": 0.002184271113947034, "timestamp": "2025-09-15 03:16:38.903827", "step": 787, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:38.936189", "step": 787, "epoch": 2 }, { "type": "loss", "content": 0.004748955834656954, "timestamp": "2025-09-15 03:16:38.959872", "step": 788, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:38.990262", "step": 788, "epoch": 2 }, { "type": "loss", "content": 0.015393979847431183, "timestamp": "2025-09-15 03:16:38.992405", "step": 789, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:39.022756", "step": 789, "epoch": 2 }, { "type": "loss", "content": 0.003272005822509527, "timestamp": "2025-09-15 03:16:39.027048", "step": 790, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:39.057665", "step": 790, "epoch": 2 }, { "type": "loss", "content": 0.0167753454297781, "timestamp": "2025-09-15 03:16:39.060351", "step": 791, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:39.091407", "step": 791, "epoch": 2 }, { "type": "loss", "content": 0.004095498938113451, "timestamp": "2025-09-15 03:16:39.120074", "step": 792, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:39.151834", "step": 792, "epoch": 2 }, { "type": "loss", "content": 0.007157167885452509, "timestamp": "2025-09-15 03:16:39.154300", "step": 793, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:39.185580", "step": 793, "epoch": 2 }, { "type": "loss", "content": 0.02066117525100708, "timestamp": "2025-09-15 03:16:39.187572", "step": 794, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:39.218973", "step": 794, "epoch": 2 }, { "type": "loss", "content": 0.011057278141379356, "timestamp": "2025-09-15 03:16:39.226137", "step": 795, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:39.256789", "step": 795, "epoch": 2 }, { "type": "loss", "content": 0.017794836312532425, "timestamp": "2025-09-15 03:16:39.282076", "step": 796, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:39.311934", "step": 796, "epoch": 2 }, { "type": "loss", "content": 0.044305894523859024, "timestamp": "2025-09-15 03:16:39.314661", "step": 797, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:39.345231", "step": 797, "epoch": 2 }, { "type": "loss", "content": 0.010922898538410664, "timestamp": "2025-09-15 03:16:39.349935", "step": 798, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:39.380088", "step": 798, "epoch": 2 }, { "type": "loss", "content": 0.0023753507994115353, "timestamp": "2025-09-15 03:16:39.382828", "step": 799, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:39.412330", "step": 799, "epoch": 2 }, { "type": "loss", "content": 0.01779024675488472, "timestamp": "2025-09-15 03:16:39.435893", "step": 800, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:39.466282", "step": 800, "epoch": 2 }, { "type": "loss", "content": 0.0019487850368022919, "timestamp": "2025-09-15 03:16:39.468504", "step": 801, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:39.498708", "step": 801, "epoch": 2 }, { "type": "loss", "content": 0.004304631147533655, "timestamp": "2025-09-15 03:16:39.502881", "step": 802, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:39.532939", "step": 802, "epoch": 2 }, { "type": "loss", "content": 0.007857972756028175, "timestamp": "2025-09-15 03:16:39.534946", "step": 803, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:39.565837", "step": 803, "epoch": 2 }, { "type": "loss", "content": 0.020163623616099358, "timestamp": "2025-09-15 03:16:39.590948", "step": 804, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:39.624856", "step": 804, "epoch": 2 }, { "type": "loss", "content": 0.0343315526843071, "timestamp": "2025-09-15 03:16:39.630262", "step": 805, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:39.663557", "step": 805, "epoch": 2 }, { "type": "loss", "content": 0.0051241847686469555, "timestamp": "2025-09-15 03:16:39.665993", "step": 806, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:39.696291", "step": 806, "epoch": 2 }, { "type": "loss", "content": 0.006621518172323704, "timestamp": "2025-09-15 03:16:39.699258", "step": 807, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:39.728546", "step": 807, "epoch": 2 }, { "type": "loss", "content": 0.00918488297611475, "timestamp": "2025-09-15 03:16:39.752161", "step": 808, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:39.782176", "step": 808, "epoch": 2 }, { "type": "loss", "content": 0.0167904794216156, "timestamp": "2025-09-15 03:16:39.784151", "step": 809, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:39.813915", "step": 809, "epoch": 2 }, { "type": "loss", "content": 0.012763723731040955, "timestamp": "2025-09-15 03:16:39.816189", "step": 810, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:39.848653", "step": 810, "epoch": 2 }, { "type": "loss", "content": 0.006828667130321264, "timestamp": "2025-09-15 03:16:39.855543", "step": 811, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:39.885412", "step": 811, "epoch": 2 }, { "type": "loss", "content": 0.007466200739145279, "timestamp": "2025-09-15 03:16:39.910767", "step": 812, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:39.940314", "step": 812, "epoch": 2 }, { "type": "loss", "content": 0.006770141888409853, "timestamp": "2025-09-15 03:16:39.942277", "step": 813, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:39.973699", "step": 813, "epoch": 2 }, { "type": "loss", "content": 0.02318437024950981, "timestamp": "2025-09-15 03:16:39.975640", "step": 814, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:40.005672", "step": 814, "epoch": 2 }, { "type": "loss", "content": 0.019127046689391136, "timestamp": "2025-09-15 03:16:40.007885", "step": 815, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:40.038004", "step": 815, "epoch": 2 }, { "type": "loss", "content": 0.005318149924278259, "timestamp": "2025-09-15 03:16:40.061953", "step": 816, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:40.092305", "step": 816, "epoch": 2 }, { "type": "loss", "content": 0.01984100416302681, "timestamp": "2025-09-15 03:16:40.094341", "step": 817, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:40.124134", "step": 817, "epoch": 2 }, { "type": "loss", "content": 0.0027165242936462164, "timestamp": "2025-09-15 03:16:40.126186", "step": 818, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:40.156080", "step": 818, "epoch": 2 }, { "type": "loss", "content": 0.01571919396519661, "timestamp": "2025-09-15 03:16:40.160558", "step": 819, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:16:40.747069", "step": 819, "epoch": 2 }, { "type": "pplx", "content": 84134180.46701647, "timestamp": "2025-09-15 03:16:40.748913", "step": 819, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:40.777001", "step": 819, "epoch": 2 }, { "type": "loss", "content": 0.010039118118584156, "timestamp": "2025-09-15 03:16:40.802576", "step": 820, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:40.832763", "step": 820, "epoch": 2 }, { "type": "loss", "content": 0.0398661270737648, "timestamp": "2025-09-15 03:16:40.835113", "step": 821, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:40.865732", "step": 821, "epoch": 2 }, { "type": "loss", "content": 0.0018895015818998218, "timestamp": "2025-09-15 03:16:40.872712", "step": 822, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:40.903265", "step": 822, "epoch": 2 }, { "type": "loss", "content": 0.0020522272679954767, "timestamp": "2025-09-15 03:16:40.910259", "step": 823, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:40.941710", "step": 823, "epoch": 2 }, { "type": "loss", "content": 0.012462636455893517, "timestamp": "2025-09-15 03:16:40.969691", "step": 824, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:41.000101", "step": 824, "epoch": 2 }, { "type": "loss", "content": 0.006202584598213434, "timestamp": "2025-09-15 03:16:41.002186", "step": 825, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:41.032496", "step": 825, "epoch": 2 }, { "type": "loss", "content": 0.011295338161289692, "timestamp": "2025-09-15 03:16:41.034785", "step": 826, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:41.065068", "step": 826, "epoch": 2 }, { "type": "loss", "content": 0.017415456473827362, "timestamp": "2025-09-15 03:16:41.067053", "step": 827, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:41.097289", "step": 827, "epoch": 2 }, { "type": "loss", "content": 0.010938125662505627, "timestamp": "2025-09-15 03:16:41.121007", "step": 828, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:41.151259", "step": 828, "epoch": 2 }, { "type": "loss", "content": 0.014517457224428654, "timestamp": "2025-09-15 03:16:41.153430", "step": 829, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:41.183394", "step": 829, "epoch": 2 }, { "type": "loss", "content": 0.03378923982381821, "timestamp": "2025-09-15 03:16:41.185440", "step": 830, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:41.216218", "step": 830, "epoch": 2 }, { "type": "loss", "content": 0.01933034136891365, "timestamp": "2025-09-15 03:16:41.223339", "step": 831, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:41.254027", "step": 831, "epoch": 2 }, { "type": "loss", "content": 0.020713625475764275, "timestamp": "2025-09-15 03:16:41.280342", "step": 832, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:41.313891", "step": 832, "epoch": 2 }, { "type": "loss", "content": 0.018430788069963455, "timestamp": "2025-09-15 03:16:41.316133", "step": 833, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:41.347729", "step": 833, "epoch": 2 }, { "type": "loss", "content": 0.018924186006188393, "timestamp": "2025-09-15 03:16:41.354328", "step": 834, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:41.384082", "step": 834, "epoch": 2 }, { "type": "loss", "content": 0.0052634854800999165, "timestamp": "2025-09-15 03:16:41.386825", "step": 835, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:41.418766", "step": 835, "epoch": 2 }, { "type": "loss", "content": 0.009086108766496181, "timestamp": "2025-09-15 03:16:41.442440", "step": 836, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:41.473513", "step": 836, "epoch": 2 }, { "type": "loss", "content": 0.020214611664414406, "timestamp": "2025-09-15 03:16:41.475468", "step": 837, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:41.505621", "step": 837, "epoch": 2 }, { "type": "loss", "content": 0.008888529613614082, "timestamp": "2025-09-15 03:16:41.507402", "step": 838, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:41.537502", "step": 838, "epoch": 2 }, { "type": "loss", "content": 0.0037065569777041674, "timestamp": "2025-09-15 03:16:41.540440", "step": 839, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:41.570216", "step": 839, "epoch": 2 }, { "type": "loss", "content": 0.0057419161312282085, "timestamp": "2025-09-15 03:16:41.593850", "step": 840, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:16:41.625229", "step": 840, "epoch": 2 }, { "type": "loss", "content": 0.0032529861200600863, "timestamp": "2025-09-15 03:16:41.630766", "step": 841, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:41.662044", "step": 841, "epoch": 2 }, { "type": "loss", "content": 0.00431677233427763, "timestamp": "2025-09-15 03:16:41.666374", "step": 842, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:16:41.697554", "step": 842, "epoch": 2 }, { "type": "loss", "content": 0.014750749804079533, "timestamp": "2025-09-15 03:16:41.705250", "step": 843, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:41.736256", "step": 843, "epoch": 2 }, { "type": "loss", "content": 0.01972171850502491, "timestamp": "2025-09-15 03:16:41.761863", "step": 844, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-15 03:16:41.793333", "step": 844, "epoch": 2 }, { "type": "loss", "content": 0.014479391276836395, "timestamp": "2025-09-15 03:16:41.801125", "step": 845, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:41.832344", "step": 845, "epoch": 2 }, { "type": "loss", "content": 0.01149661373347044, "timestamp": "2025-09-15 03:16:41.839362", "step": 846, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:41.869753", "step": 846, "epoch": 2 }, { "type": "loss", "content": 0.014005172066390514, "timestamp": "2025-09-15 03:16:41.876593", "step": 847, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:41.906550", "step": 847, "epoch": 2 }, { "type": "loss", "content": 0.027859285473823547, "timestamp": "2025-09-15 03:16:41.930242", "step": 848, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:41.962208", "step": 848, "epoch": 2 }, { "type": "loss", "content": 0.002458788687363267, "timestamp": "2025-09-15 03:16:41.964298", "step": 849, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:41.995211", "step": 849, "epoch": 2 }, { "type": "loss", "content": 0.03137155622243881, "timestamp": "2025-09-15 03:16:41.999822", "step": 850, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:42.030035", "step": 850, "epoch": 2 }, { "type": "loss", "content": 0.02159748785197735, "timestamp": "2025-09-15 03:16:42.032302", "step": 851, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:42.062775", "step": 851, "epoch": 2 }, { "type": "loss", "content": 0.0020572689827531576, "timestamp": "2025-09-15 03:16:42.090949", "step": 852, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:42.121291", "step": 852, "epoch": 2 }, { "type": "loss", "content": 0.012123345397412777, "timestamp": "2025-09-15 03:16:42.123461", "step": 853, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:42.153335", "step": 853, "epoch": 2 }, { "type": "loss", "content": 0.0106295645236969, "timestamp": "2025-09-15 03:16:42.155664", "step": 854, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:42.186675", "step": 854, "epoch": 2 }, { "type": "loss", "content": 0.003279120195657015, "timestamp": "2025-09-15 03:16:42.194210", "step": 855, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:42.224629", "step": 855, "epoch": 2 }, { "type": "loss", "content": 0.006891821976751089, "timestamp": "2025-09-15 03:16:42.253404", "step": 856, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:42.283132", "step": 856, "epoch": 2 }, { "type": "loss", "content": 0.02520199678838253, "timestamp": "2025-09-15 03:16:42.285755", "step": 857, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:42.316268", "step": 857, "epoch": 2 }, { "type": "loss", "content": 0.03084452822804451, "timestamp": "2025-09-15 03:16:42.318793", "step": 858, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:16:42.916288", "step": 858, "epoch": 2 }, { "type": "pplx", "content": 87225457.20443432, "timestamp": "2025-09-15 03:16:42.918481", "step": 858, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:16:42.947505", "step": 858, "epoch": 2 }, { "type": "loss", "content": 0.004284991882741451, "timestamp": "2025-09-15 03:16:42.955023", "step": 859, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:42.986798", "step": 859, "epoch": 2 }, { "type": "loss", "content": 0.03205358237028122, "timestamp": "2025-09-15 03:16:43.012665", "step": 860, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:43.042712", "step": 860, "epoch": 2 }, { "type": "loss", "content": 0.016096081584692, "timestamp": "2025-09-15 03:16:43.045202", "step": 861, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:43.075796", "step": 861, "epoch": 2 }, { "type": "loss", "content": 0.014634288847446442, "timestamp": "2025-09-15 03:16:43.078808", "step": 862, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:43.108878", "step": 862, "epoch": 2 }, { "type": "loss", "content": 0.016217602416872978, "timestamp": "2025-09-15 03:16:43.113669", "step": 863, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:43.144194", "step": 863, "epoch": 2 }, { "type": "loss", "content": 0.008187689818441868, "timestamp": "2025-09-15 03:16:43.172385", "step": 864, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:43.202443", "step": 864, "epoch": 2 }, { "type": "loss", "content": 0.005138729233294725, "timestamp": "2025-09-15 03:16:43.204527", "step": 865, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:43.234718", "step": 865, "epoch": 2 }, { "type": "loss", "content": 0.018294647336006165, "timestamp": "2025-09-15 03:16:43.241946", "step": 866, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:43.272979", "step": 866, "epoch": 2 }, { "type": "loss", "content": 0.010565018281340599, "timestamp": "2025-09-15 03:16:43.277628", "step": 867, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:43.307641", "step": 867, "epoch": 2 }, { "type": "loss", "content": 0.004669521003961563, "timestamp": "2025-09-15 03:16:43.335792", "step": 868, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:43.366278", "step": 868, "epoch": 2 }, { "type": "loss", "content": 0.005344860255718231, "timestamp": "2025-09-15 03:16:43.371627", "step": 869, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:43.401841", "step": 869, "epoch": 2 }, { "type": "loss", "content": 0.015962885692715645, "timestamp": "2025-09-15 03:16:43.406640", "step": 870, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:43.436453", "step": 870, "epoch": 2 }, { "type": "loss", "content": 0.008724975399672985, "timestamp": "2025-09-15 03:16:43.440979", "step": 871, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:43.472403", "step": 871, "epoch": 2 }, { "type": "loss", "content": 0.0020475266501307487, "timestamp": "2025-09-15 03:16:43.500064", "step": 872, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:43.530208", "step": 872, "epoch": 2 }, { "type": "loss", "content": 0.00470705796033144, "timestamp": "2025-09-15 03:16:43.532249", "step": 873, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:43.562594", "step": 873, "epoch": 2 }, { "type": "loss", "content": 0.005053224973380566, "timestamp": "2025-09-15 03:16:43.566313", "step": 874, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:43.596323", "step": 874, "epoch": 2 }, { "type": "loss", "content": 0.009020118974149227, "timestamp": "2025-09-15 03:16:43.598669", "step": 875, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:43.629073", "step": 875, "epoch": 2 }, { "type": "loss", "content": 0.014806902967393398, "timestamp": "2025-09-15 03:16:43.652929", "step": 876, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:43.682761", "step": 876, "epoch": 2 }, { "type": "loss", "content": 0.021488988772034645, "timestamp": "2025-09-15 03:16:43.685056", "step": 877, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:43.714718", "step": 877, "epoch": 2 }, { "type": "loss", "content": 0.018449898809194565, "timestamp": "2025-09-15 03:16:43.716963", "step": 878, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:43.746605", "step": 878, "epoch": 2 }, { "type": "loss", "content": 0.0025834415573626757, "timestamp": "2025-09-15 03:16:43.748886", "step": 879, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:43.779851", "step": 879, "epoch": 2 }, { "type": "loss", "content": 0.0028492652345448732, "timestamp": "2025-09-15 03:16:43.808017", "step": 880, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:16:43.839151", "step": 880, "epoch": 2 }, { "type": "loss", "content": 0.004528566263616085, "timestamp": "2025-09-15 03:16:43.844761", "step": 881, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:43.874807", "step": 881, "epoch": 2 }, { "type": "loss", "content": 0.004636577796190977, "timestamp": "2025-09-15 03:16:43.876928", "step": 882, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:43.907477", "step": 882, "epoch": 2 }, { "type": "loss", "content": 0.0033082941081374884, "timestamp": "2025-09-15 03:16:43.914552", "step": 883, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:43.944312", "step": 883, "epoch": 2 }, { "type": "loss", "content": 0.005281738005578518, "timestamp": "2025-09-15 03:16:43.968222", "step": 884, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:43.999714", "step": 884, "epoch": 2 }, { "type": "loss", "content": 0.004062555264681578, "timestamp": "2025-09-15 03:16:44.002174", "step": 885, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:44.032133", "step": 885, "epoch": 2 }, { "type": "loss", "content": 0.007704237941652536, "timestamp": "2025-09-15 03:16:44.034499", "step": 886, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:44.063953", "step": 886, "epoch": 2 }, { "type": "loss", "content": 0.025397544726729393, "timestamp": "2025-09-15 03:16:44.066115", "step": 887, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:44.096062", "step": 887, "epoch": 2 }, { "type": "loss", "content": 0.00595526909455657, "timestamp": "2025-09-15 03:16:44.119988", "step": 888, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:44.149986", "step": 888, "epoch": 2 }, { "type": "loss", "content": 0.013230021111667156, "timestamp": "2025-09-15 03:16:44.152288", "step": 889, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:44.182439", "step": 889, "epoch": 2 }, { "type": "loss", "content": 0.002272603800520301, "timestamp": "2025-09-15 03:16:44.186721", "step": 890, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:44.217183", "step": 890, "epoch": 2 }, { "type": "loss", "content": 0.005084930453449488, "timestamp": "2025-09-15 03:16:44.219817", "step": 891, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:44.249994", "step": 891, "epoch": 2 }, { "type": "loss", "content": 0.013971127569675446, "timestamp": "2025-09-15 03:16:44.275538", "step": 892, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:44.305812", "step": 892, "epoch": 2 }, { "type": "loss", "content": 0.001800144906155765, "timestamp": "2025-09-15 03:16:44.307843", "step": 893, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:44.337821", "step": 893, "epoch": 2 }, { "type": "loss", "content": 0.005557761527597904, "timestamp": "2025-09-15 03:16:44.342537", "step": 894, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:44.373589", "step": 894, "epoch": 2 }, { "type": "loss", "content": 0.014650067314505577, "timestamp": "2025-09-15 03:16:44.380529", "step": 895, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:44.411601", "step": 895, "epoch": 2 }, { "type": "loss", "content": 0.001939317211508751, "timestamp": "2025-09-15 03:16:44.440192", "step": 896, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:44.470858", "step": 896, "epoch": 2 }, { "type": "loss", "content": 0.0019788548815995455, "timestamp": "2025-09-15 03:16:44.475506", "step": 897, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:16:45.066861", "step": 897, "epoch": 2 }, { "type": "pplx", "content": 97664972.25220236, "timestamp": "2025-09-15 03:16:45.068990", "step": 897, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:45.097613", "step": 897, "epoch": 2 }, { "type": "loss", "content": 0.0019209941383451223, "timestamp": "2025-09-15 03:16:45.099566", "step": 898, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-15 03:16:45.130259", "step": 898, "epoch": 2 }, { "type": "loss", "content": 0.0007233588839881122, "timestamp": "2025-09-15 03:16:45.140677", "step": 899, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:45.171112", "step": 899, "epoch": 2 }, { "type": "loss", "content": 0.0020077857188880444, "timestamp": "2025-09-15 03:16:45.195064", "step": 900, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:45.225585", "step": 900, "epoch": 2 }, { "type": "loss", "content": 0.0025911724660545588, "timestamp": "2025-09-15 03:16:45.230893", "step": 901, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:45.260717", "step": 901, "epoch": 2 }, { "type": "loss", "content": 0.004694484639912844, "timestamp": "2025-09-15 03:16:45.265555", "step": 902, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:45.295991", "step": 902, "epoch": 2 }, { "type": "loss", "content": 0.0017370609566569328, "timestamp": "2025-09-15 03:16:45.300372", "step": 903, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:45.330404", "step": 903, "epoch": 2 }, { "type": "loss", "content": 0.003571173409000039, "timestamp": "2025-09-15 03:16:45.356017", "step": 904, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:45.386138", "step": 904, "epoch": 2 }, { "type": "loss", "content": 0.004272869322448969, "timestamp": "2025-09-15 03:16:45.389629", "step": 905, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:16:45.419997", "step": 905, "epoch": 2 }, { "type": "loss", "content": 0.003742960514500737, "timestamp": "2025-09-15 03:16:45.427854", "step": 906, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 304 ], "flops": 9017733530176 }, "timestamp": "2025-09-15 03:16:45.458372", "step": 906, "epoch": 2 }, { "type": "loss", "content": 0.0016657983651384711, "timestamp": "2025-09-15 03:16:45.470813", "step": 907, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:45.501490", "step": 907, "epoch": 2 }, { "type": "loss", "content": 0.0016500052297487855, "timestamp": "2025-09-15 03:16:45.527072", "step": 908, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:45.556842", "step": 908, "epoch": 2 }, { "type": "loss", "content": 0.002180821727961302, "timestamp": "2025-09-15 03:16:45.561938", "step": 909, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:45.592053", "step": 909, "epoch": 2 }, { "type": "loss", "content": 0.0025096870958805084, "timestamp": "2025-09-15 03:16:45.599135", "step": 910, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:45.630471", "step": 910, "epoch": 2 }, { "type": "loss", "content": 0.002849652897566557, "timestamp": "2025-09-15 03:16:45.635284", "step": 911, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:45.670223", "step": 911, "epoch": 2 }, { "type": "loss", "content": 0.003814171999692917, "timestamp": "2025-09-15 03:16:45.695612", "step": 912, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:45.725912", "step": 912, "epoch": 2 }, { "type": "loss", "content": 0.00793201383203268, "timestamp": "2025-09-15 03:16:45.728924", "step": 913, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:45.759626", "step": 913, "epoch": 2 }, { "type": "loss", "content": 0.0019563245587050915, "timestamp": "2025-09-15 03:16:45.762042", "step": 914, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:45.792523", "step": 914, "epoch": 2 }, { "type": "loss", "content": 0.0011040839599445462, "timestamp": "2025-09-15 03:16:45.795234", "step": 915, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:45.825711", "step": 915, "epoch": 2 }, { "type": "loss", "content": 0.0028216461651027203, "timestamp": "2025-09-15 03:16:45.849403", "step": 916, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:45.881013", "step": 916, "epoch": 2 }, { "type": "loss", "content": 0.0007526809931732714, "timestamp": "2025-09-15 03:16:45.883687", "step": 917, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:45.914280", "step": 917, "epoch": 2 }, { "type": "loss", "content": 0.0008043412235565484, "timestamp": "2025-09-15 03:16:45.921225", "step": 918, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:45.951448", "step": 918, "epoch": 2 }, { "type": "loss", "content": 0.001197122037410736, "timestamp": "2025-09-15 03:16:45.953901", "step": 919, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:45.984553", "step": 919, "epoch": 2 }, { "type": "loss", "content": 0.0014595863176509738, "timestamp": "2025-09-15 03:16:46.008093", "step": 920, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:46.038952", "step": 920, "epoch": 2 }, { "type": "loss", "content": 0.001012066495604813, "timestamp": "2025-09-15 03:16:46.041717", "step": 921, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:46.071851", "step": 921, "epoch": 2 }, { "type": "loss", "content": 0.00035511283203959465, "timestamp": "2025-09-15 03:16:46.076148", "step": 922, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:46.106451", "step": 922, "epoch": 2 }, { "type": "loss", "content": 0.000994206522591412, "timestamp": "2025-09-15 03:16:46.113750", "step": 923, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:46.143823", "step": 923, "epoch": 2 }, { "type": "loss", "content": 0.0010711466893553734, "timestamp": "2025-09-15 03:16:46.169656", "step": 924, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:46.199461", "step": 924, "epoch": 2 }, { "type": "loss", "content": 0.003031970700249076, "timestamp": "2025-09-15 03:16:46.201635", "step": 925, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:46.231677", "step": 925, "epoch": 2 }, { "type": "loss", "content": 0.0010669993935152888, "timestamp": "2025-09-15 03:16:46.236430", "step": 926, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:46.266552", "step": 926, "epoch": 2 }, { "type": "loss", "content": 0.004038740415126085, "timestamp": "2025-09-15 03:16:46.269391", "step": 927, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:46.299597", "step": 927, "epoch": 2 }, { "type": "loss", "content": 0.00037929200334474444, "timestamp": "2025-09-15 03:16:46.323281", "step": 928, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:46.353625", "step": 928, "epoch": 2 }, { "type": "loss", "content": 0.00020541806588880718, "timestamp": "2025-09-15 03:16:46.358221", "step": 929, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:46.387974", "step": 929, "epoch": 2 }, { "type": "loss", "content": 0.0017460313392803073, "timestamp": "2025-09-15 03:16:46.395060", "step": 930, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:46.425458", "step": 930, "epoch": 2 }, { "type": "loss", "content": 0.0005159589927643538, "timestamp": "2025-09-15 03:16:46.432681", "step": 931, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:46.463015", "step": 931, "epoch": 2 }, { "type": "loss", "content": 0.0029993560165166855, "timestamp": "2025-09-15 03:16:46.488289", "step": 932, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:46.518733", "step": 932, "epoch": 2 }, { "type": "loss", "content": 0.0013383693294599652, "timestamp": "2025-09-15 03:16:46.523490", "step": 933, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:46.553508", "step": 933, "epoch": 2 }, { "type": "loss", "content": 0.0037266218569129705, "timestamp": "2025-09-15 03:16:46.555643", "step": 934, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:46.586127", "step": 934, "epoch": 2 }, { "type": "loss", "content": 0.0014700050232931972, "timestamp": "2025-09-15 03:16:46.588879", "step": 935, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:46.619999", "step": 935, "epoch": 2 }, { "type": "loss", "content": 0.0016449715476483107, "timestamp": "2025-09-15 03:16:46.644009", "step": 936, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:16:47.230087", "step": 936, "epoch": 2 }, { "type": "pplx", "content": 117820981.15308204, "timestamp": "2025-09-15 03:16:47.231764", "step": 936, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:47.259736", "step": 936, "epoch": 2 }, { "type": "loss", "content": 0.00042086487519554794, "timestamp": "2025-09-15 03:16:47.261916", "step": 937, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:47.292556", "step": 937, "epoch": 2 }, { "type": "loss", "content": 0.0002839158405549824, "timestamp": "2025-09-15 03:16:47.296766", "step": 938, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:47.327598", "step": 938, "epoch": 2 }, { "type": "loss", "content": 0.00030601557227782905, "timestamp": "2025-09-15 03:16:47.332396", "step": 939, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:47.364109", "step": 939, "epoch": 2 }, { "type": "loss", "content": 0.00037333546788431704, "timestamp": "2025-09-15 03:16:47.388182", "step": 940, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:47.419364", "step": 940, "epoch": 2 }, { "type": "loss", "content": 0.0013638328528031707, "timestamp": "2025-09-15 03:16:47.421913", "step": 941, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:47.453328", "step": 941, "epoch": 2 }, { "type": "loss", "content": 0.0005319089395925403, "timestamp": "2025-09-15 03:16:47.455415", "step": 942, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:47.485061", "step": 942, "epoch": 2 }, { "type": "loss", "content": 0.00039939445559866726, "timestamp": "2025-09-15 03:16:47.487018", "step": 943, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:47.516953", "step": 943, "epoch": 2 }, { "type": "loss", "content": 0.002649063942953944, "timestamp": "2025-09-15 03:16:47.540563", "step": 944, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:47.571323", "step": 944, "epoch": 2 }, { "type": "loss", "content": 0.0003652539453469217, "timestamp": "2025-09-15 03:16:47.576702", "step": 945, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:47.606794", "step": 945, "epoch": 2 }, { "type": "loss", "content": 0.0002454536152072251, "timestamp": "2025-09-15 03:16:47.611600", "step": 946, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:47.641913", "step": 946, "epoch": 2 }, { "type": "loss", "content": 0.0004982949467375875, "timestamp": "2025-09-15 03:16:47.646251", "step": 947, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:47.676735", "step": 947, "epoch": 2 }, { "type": "loss", "content": 0.0007253338699229062, "timestamp": "2025-09-15 03:16:47.702580", "step": 948, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:47.732998", "step": 948, "epoch": 2 }, { "type": "loss", "content": 0.001324442564509809, "timestamp": "2025-09-15 03:16:47.735102", "step": 949, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:47.764092", "step": 949, "epoch": 2 }, { "type": "loss", "content": 0.0002965231833513826, "timestamp": "2025-09-15 03:16:47.766287", "step": 950, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:47.796296", "step": 950, "epoch": 2 }, { "type": "loss", "content": 0.003919391892850399, "timestamp": "2025-09-15 03:16:47.798768", "step": 951, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:47.829520", "step": 951, "epoch": 2 }, { "type": "loss", "content": 0.0001225820742547512, "timestamp": "2025-09-15 03:16:47.858087", "step": 952, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:47.887663", "step": 952, "epoch": 2 }, { "type": "loss", "content": 0.00019038261962123215, "timestamp": "2025-09-15 03:16:47.890124", "step": 953, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:47.920165", "step": 953, "epoch": 2 }, { "type": "loss", "content": 0.0002864906855393201, "timestamp": "2025-09-15 03:16:47.922554", "step": 954, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:47.952164", "step": 954, "epoch": 2 }, { "type": "loss", "content": 0.0006765606231056154, "timestamp": "2025-09-15 03:16:47.954372", "step": 955, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:47.985059", "step": 955, "epoch": 2 }, { "type": "loss", "content": 0.00018442471628077328, "timestamp": "2025-09-15 03:16:48.013313", "step": 956, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:48.043057", "step": 956, "epoch": 2 }, { "type": "loss", "content": 0.003149544121697545, "timestamp": "2025-09-15 03:16:48.045009", "step": 957, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:48.074542", "step": 957, "epoch": 2 }, { "type": "loss", "content": 0.0019628521986305714, "timestamp": "2025-09-15 03:16:48.076662", "step": 958, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:48.106578", "step": 958, "epoch": 2 }, { "type": "loss", "content": 0.0011997123947367072, "timestamp": "2025-09-15 03:16:48.108751", "step": 959, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:48.138763", "step": 959, "epoch": 2 }, { "type": "loss", "content": 0.002039334736764431, "timestamp": "2025-09-15 03:16:48.162668", "step": 960, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:48.192851", "step": 960, "epoch": 2 }, { "type": "loss", "content": 8.800149953458458e-05, "timestamp": "2025-09-15 03:16:48.195378", "step": 961, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:48.225502", "step": 961, "epoch": 2 }, { "type": "loss", "content": 0.0015280414372682571, "timestamp": "2025-09-15 03:16:48.227626", "step": 962, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:48.257614", "step": 962, "epoch": 2 }, { "type": "loss", "content": 0.00011810220166807994, "timestamp": "2025-09-15 03:16:48.260475", "step": 963, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:48.290224", "step": 963, "epoch": 2 }, { "type": "loss", "content": 0.019818326458334923, "timestamp": "2025-09-15 03:16:48.318361", "step": 964, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:48.347748", "step": 964, "epoch": 2 }, { "type": "loss", "content": 0.0020504810381680727, "timestamp": "2025-09-15 03:16:48.349696", "step": 965, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:48.380277", "step": 965, "epoch": 2 }, { "type": "loss", "content": 0.0007288079359568655, "timestamp": "2025-09-15 03:16:48.384560", "step": 966, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:48.414652", "step": 966, "epoch": 2 }, { "type": "loss", "content": 0.0011788300471380353, "timestamp": "2025-09-15 03:16:48.417759", "step": 967, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:48.454870", "step": 967, "epoch": 2 }, { "type": "loss", "content": 0.0001325029879808426, "timestamp": "2025-09-15 03:16:48.483130", "step": 968, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:48.513336", "step": 968, "epoch": 2 }, { "type": "loss", "content": 0.0010475792223587632, "timestamp": "2025-09-15 03:16:48.515610", "step": 969, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:48.545754", "step": 969, "epoch": 2 }, { "type": "loss", "content": 0.0006575215957127512, "timestamp": "2025-09-15 03:16:48.548624", "step": 970, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:48.578607", "step": 970, "epoch": 2 }, { "type": "loss", "content": 7.66265657148324e-05, "timestamp": "2025-09-15 03:16:48.583485", "step": 971, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:48.613311", "step": 971, "epoch": 2 }, { "type": "loss", "content": 0.008739389479160309, "timestamp": "2025-09-15 03:16:48.636856", "step": 972, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:48.668338", "step": 972, "epoch": 2 }, { "type": "loss", "content": 0.00012891367077827454, "timestamp": "2025-09-15 03:16:48.673295", "step": 973, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:48.703191", "step": 973, "epoch": 2 }, { "type": "loss", "content": 9.70660476014018e-05, "timestamp": "2025-09-15 03:16:48.707453", "step": 974, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:48.737410", "step": 974, "epoch": 2 }, { "type": "loss", "content": 8.728348620934412e-05, "timestamp": "2025-09-15 03:16:48.744365", "step": 975, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:16:49.345637", "step": 975, "epoch": 2 }, { "type": "pplx", "content": 130597413.28941588, "timestamp": "2025-09-15 03:16:49.350763", "step": 975, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:49.379334", "step": 975, "epoch": 2 }, { "type": "loss", "content": 0.00420480826869607, "timestamp": "2025-09-15 03:16:49.403089", "step": 976, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:49.433144", "step": 976, "epoch": 2 }, { "type": "loss", "content": 0.032693080604076385, "timestamp": "2025-09-15 03:16:49.435199", "step": 977, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:49.465452", "step": 977, "epoch": 2 }, { "type": "loss", "content": 0.0009658890776336193, "timestamp": "2025-09-15 03:16:49.467727", "step": 978, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:49.497553", "step": 978, "epoch": 2 }, { "type": "loss", "content": 0.0034255923237651587, "timestamp": "2025-09-15 03:16:49.504999", "step": 979, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:49.535164", "step": 979, "epoch": 2 }, { "type": "loss", "content": 0.0006966963992454112, "timestamp": "2025-09-15 03:16:49.559057", "step": 980, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:49.590644", "step": 980, "epoch": 2 }, { "type": "loss", "content": 0.034857045859098434, "timestamp": "2025-09-15 03:16:49.593073", "step": 981, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:49.625258", "step": 981, "epoch": 2 }, { "type": "loss", "content": 0.0002292133867740631, "timestamp": "2025-09-15 03:16:49.627739", "step": 982, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:16:49.658448", "step": 982, "epoch": 2 }, { "type": "loss", "content": 8.10977871879004e-05, "timestamp": "2025-09-15 03:16:49.666318", "step": 983, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:49.696602", "step": 983, "epoch": 2 }, { "type": "loss", "content": 0.006245698779821396, "timestamp": "2025-09-15 03:16:49.720473", "step": 984, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:49.751066", "step": 984, "epoch": 2 }, { "type": "loss", "content": 0.0010302267037332058, "timestamp": "2025-09-15 03:16:49.753521", "step": 985, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:49.783624", "step": 985, "epoch": 2 }, { "type": "loss", "content": 0.0007164975395426154, "timestamp": "2025-09-15 03:16:49.787905", "step": 986, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:49.817434", "step": 986, "epoch": 2 }, { "type": "loss", "content": 0.0336216501891613, "timestamp": "2025-09-15 03:16:49.819809", "step": 987, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:49.850210", "step": 987, "epoch": 2 }, { "type": "loss", "content": 0.0003482494503259659, "timestamp": "2025-09-15 03:16:49.873728", "step": 988, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:49.904007", "step": 988, "epoch": 2 }, { "type": "loss", "content": 0.0004796452703885734, "timestamp": "2025-09-15 03:16:49.906197", "step": 989, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:49.936436", "step": 989, "epoch": 2 }, { "type": "loss", "content": 0.00031821359880268574, "timestamp": "2025-09-15 03:16:49.938832", "step": 990, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:49.969477", "step": 990, "epoch": 2 }, { "type": "loss", "content": 0.010607955977320671, "timestamp": "2025-09-15 03:16:49.974161", "step": 991, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:50.004515", "step": 991, "epoch": 2 }, { "type": "loss", "content": 0.0005565133760683239, "timestamp": "2025-09-15 03:16:50.028045", "step": 992, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:50.059228", "step": 992, "epoch": 2 }, { "type": "loss", "content": 0.005336018744856119, "timestamp": "2025-09-15 03:16:50.061416", "step": 993, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:50.092213", "step": 993, "epoch": 2 }, { "type": "loss", "content": 0.002163677243515849, "timestamp": "2025-09-15 03:16:50.096736", "step": 994, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:50.127355", "step": 994, "epoch": 2 }, { "type": "loss", "content": 0.008492656983435154, "timestamp": "2025-09-15 03:16:50.129744", "step": 995, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:50.160754", "step": 995, "epoch": 2 }, { "type": "loss", "content": 0.004409152548760176, "timestamp": "2025-09-15 03:16:50.184430", "step": 996, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:50.214949", "step": 996, "epoch": 2 }, { "type": "loss", "content": 0.04378414899110794, "timestamp": "2025-09-15 03:16:50.216905", "step": 997, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:50.247859", "step": 997, "epoch": 2 }, { "type": "loss", "content": 0.011165916919708252, "timestamp": "2025-09-15 03:16:50.252116", "step": 998, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:50.283378", "step": 998, "epoch": 2 }, { "type": "loss", "content": 0.00016595398483332247, "timestamp": "2025-09-15 03:16:50.289967", "step": 999, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:50.321182", "step": 999, "epoch": 2 }, { "type": "loss", "content": 0.0020952578634023666, "timestamp": "2025-09-15 03:16:50.346121", "step": 1000, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1000", "timestamp": "2025-09-15 03:16:57.070929", "step": 1000, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:57.133918", "step": 1000, "epoch": 2 }, { "type": "loss", "content": 0.00026529579190537333, "timestamp": "2025-09-15 03:16:57.137117", "step": 1001, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:57.169836", "step": 1001, "epoch": 2 }, { "type": "loss", "content": 0.0021564450580626726, "timestamp": "2025-09-15 03:16:57.173725", "step": 1002, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:57.204137", "step": 1002, "epoch": 2 }, { "type": "loss", "content": 9.368791506858543e-05, "timestamp": "2025-09-15 03:16:57.206838", "step": 1003, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:57.237714", "step": 1003, "epoch": 2 }, { "type": "loss", "content": 0.0009809520561248064, "timestamp": "2025-09-15 03:16:57.262247", "step": 1004, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:57.293076", "step": 1004, "epoch": 2 }, { "type": "loss", "content": 0.003123172326013446, "timestamp": "2025-09-15 03:16:57.297136", "step": 1005, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:57.327447", "step": 1005, "epoch": 2 }, { "type": "loss", "content": 0.002402219222858548, "timestamp": "2025-09-15 03:16:57.334030", "step": 1006, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:57.364845", "step": 1006, "epoch": 2 }, { "type": "loss", "content": 0.00017964097787626088, "timestamp": "2025-09-15 03:16:57.371356", "step": 1007, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:57.403222", "step": 1007, "epoch": 2 }, { "type": "loss", "content": 0.0009959181770682335, "timestamp": "2025-09-15 03:16:57.427266", "step": 1008, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:57.458850", "step": 1008, "epoch": 2 }, { "type": "loss", "content": 0.013769718818366528, "timestamp": "2025-09-15 03:16:57.463508", "step": 1009, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:57.494685", "step": 1009, "epoch": 2 }, { "type": "loss", "content": 0.005754461046308279, "timestamp": "2025-09-15 03:16:57.496945", "step": 1010, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:57.527905", "step": 1010, "epoch": 2 }, { "type": "loss", "content": 0.013897545635700226, "timestamp": "2025-09-15 03:16:57.529969", "step": 1011, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:57.560739", "step": 1011, "epoch": 2 }, { "type": "loss", "content": 0.0009429440833628178, "timestamp": "2025-09-15 03:16:57.584626", "step": 1012, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:57.616118", "step": 1012, "epoch": 2 }, { "type": "loss", "content": 0.004884080495685339, "timestamp": "2025-09-15 03:16:57.620063", "step": 1013, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:57.652340", "step": 1013, "epoch": 2 }, { "type": "loss", "content": 0.00030809929012320936, "timestamp": "2025-09-15 03:16:57.654556", "step": 1014, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:16:58.246830", "step": 1014, "epoch": 2 }, { "type": "pplx", "content": 133013128.32087575, "timestamp": "2025-09-15 03:16:58.248804", "step": 1014, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:58.277860", "step": 1014, "epoch": 2 }, { "type": "loss", "content": 0.02146388776600361, "timestamp": "2025-09-15 03:16:58.280024", "step": 1015, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:16:58.311708", "step": 1015, "epoch": 2 }, { "type": "loss", "content": 0.00024146329087670892, "timestamp": "2025-09-15 03:16:58.338981", "step": 1016, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:58.369735", "step": 1016, "epoch": 2 }, { "type": "loss", "content": 0.0033533659297972918, "timestamp": "2025-09-15 03:16:58.373452", "step": 1017, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:58.406216", "step": 1017, "epoch": 2 }, { "type": "loss", "content": 0.0004804394266102463, "timestamp": "2025-09-15 03:16:58.418074", "step": 1018, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-15 03:16:58.453288", "step": 1018, "epoch": 2 }, { "type": "loss", "content": 9.127042721956968e-05, "timestamp": "2025-09-15 03:16:58.462898", "step": 1019, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:58.495023", "step": 1019, "epoch": 2 }, { "type": "loss", "content": 0.005294309463351965, "timestamp": "2025-09-15 03:16:58.519323", "step": 1020, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:58.553035", "step": 1020, "epoch": 2 }, { "type": "loss", "content": 0.034268833696842194, "timestamp": "2025-09-15 03:16:58.557124", "step": 1021, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:58.591270", "step": 1021, "epoch": 2 }, { "type": "loss", "content": 0.0009754736674949527, "timestamp": "2025-09-15 03:16:58.605187", "step": 1022, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:58.639155", "step": 1022, "epoch": 2 }, { "type": "loss", "content": 0.001966263633221388, "timestamp": "2025-09-15 03:16:58.642459", "step": 1023, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:58.673939", "step": 1023, "epoch": 2 }, { "type": "loss", "content": 0.021047789603471756, "timestamp": "2025-09-15 03:16:58.699063", "step": 1024, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:58.731595", "step": 1024, "epoch": 2 }, { "type": "loss", "content": 0.013026480562984943, "timestamp": "2025-09-15 03:16:58.736599", "step": 1025, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:58.767626", "step": 1025, "epoch": 2 }, { "type": "loss", "content": 0.006008670665323734, "timestamp": "2025-09-15 03:16:58.774634", "step": 1026, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:58.805623", "step": 1026, "epoch": 2 }, { "type": "loss", "content": 0.0004582749679684639, "timestamp": "2025-09-15 03:16:58.807842", "step": 1027, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-15 03:16:58.838737", "step": 1027, "epoch": 2 }, { "type": "loss", "content": 0.006711532361805439, "timestamp": "2025-09-15 03:16:58.869644", "step": 1028, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:58.902819", "step": 1028, "epoch": 2 }, { "type": "loss", "content": 0.0003885440528392792, "timestamp": "2025-09-15 03:16:58.914280", "step": 1029, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:58.959725", "step": 1029, "epoch": 2 }, { "type": "loss", "content": 0.0003321361436974257, "timestamp": "2025-09-15 03:16:58.977293", "step": 1030, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:59.020648", "step": 1030, "epoch": 2 }, { "type": "loss", "content": 0.0006997405434958637, "timestamp": "2025-09-15 03:16:59.034078", "step": 1031, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:59.080505", "step": 1031, "epoch": 2 }, { "type": "loss", "content": 0.00020196476543787867, "timestamp": "2025-09-15 03:16:59.113059", "step": 1032, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:59.146827", "step": 1032, "epoch": 2 }, { "type": "loss", "content": 0.0006217014160938561, "timestamp": "2025-09-15 03:16:59.150555", "step": 1033, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:59.182179", "step": 1033, "epoch": 2 }, { "type": "loss", "content": 0.00045512375072576106, "timestamp": "2025-09-15 03:16:59.185915", "step": 1034, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:59.216866", "step": 1034, "epoch": 2 }, { "type": "loss", "content": 0.0001574978668941185, "timestamp": "2025-09-15 03:16:59.219089", "step": 1035, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:59.250202", "step": 1035, "epoch": 2 }, { "type": "loss", "content": 0.0007082565571181476, "timestamp": "2025-09-15 03:16:59.274443", "step": 1036, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:16:59.305602", "step": 1036, "epoch": 2 }, { "type": "loss", "content": 0.0003342053678352386, "timestamp": "2025-09-15 03:16:59.308300", "step": 1037, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:16:59.339682", "step": 1037, "epoch": 2 }, { "type": "loss", "content": 0.0035582254640758038, "timestamp": "2025-09-15 03:16:59.346312", "step": 1038, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:16:59.377184", "step": 1038, "epoch": 2 }, { "type": "loss", "content": 0.0013459293404594064, "timestamp": "2025-09-15 03:16:59.379689", "step": 1039, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:59.411229", "step": 1039, "epoch": 2 }, { "type": "loss", "content": 0.0205552838742733, "timestamp": "2025-09-15 03:16:59.439234", "step": 1040, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:59.470110", "step": 1040, "epoch": 2 }, { "type": "loss", "content": 0.007523669395595789, "timestamp": "2025-09-15 03:16:59.472328", "step": 1041, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 336 ], "flops": 9966940982208 }, "timestamp": "2025-09-15 03:16:59.505554", "step": 1041, "epoch": 2 }, { "type": "loss", "content": 0.00014045715215615928, "timestamp": "2025-09-15 03:16:59.518990", "step": 1042, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:59.556205", "step": 1042, "epoch": 2 }, { "type": "loss", "content": 0.006666823290288448, "timestamp": "2025-09-15 03:16:59.558569", "step": 1043, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:16:59.589159", "step": 1043, "epoch": 2 }, { "type": "loss", "content": 0.0002057207893813029, "timestamp": "2025-09-15 03:16:59.621802", "step": 1044, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:16:59.668732", "step": 1044, "epoch": 2 }, { "type": "loss", "content": 0.004083287436515093, "timestamp": "2025-09-15 03:16:59.686645", "step": 1045, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:16:59.731960", "step": 1045, "epoch": 2 }, { "type": "loss", "content": 0.004076201934367418, "timestamp": "2025-09-15 03:16:59.745434", "step": 1046, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:59.780555", "step": 1046, "epoch": 2 }, { "type": "loss", "content": 0.00010849825775949284, "timestamp": "2025-09-15 03:16:59.782985", "step": 1047, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:59.814671", "step": 1047, "epoch": 2 }, { "type": "loss", "content": 0.00018463960441295058, "timestamp": "2025-09-15 03:16:59.838427", "step": 1048, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:16:59.869922", "step": 1048, "epoch": 2 }, { "type": "loss", "content": 9.18033329071477e-05, "timestamp": "2025-09-15 03:16:59.872883", "step": 1049, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:59.904326", "step": 1049, "epoch": 2 }, { "type": "loss", "content": 0.0013023499632254243, "timestamp": "2025-09-15 03:16:59.908682", "step": 1050, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:16:59.942538", "step": 1050, "epoch": 2 }, { "type": "loss", "content": 0.00042243319330736995, "timestamp": "2025-09-15 03:16:59.945859", "step": 1051, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:16:59.975909", "step": 1051, "epoch": 2 }, { "type": "loss", "content": 0.0019100798526778817, "timestamp": "2025-09-15 03:17:00.002158", "step": 1052, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:00.042626", "step": 1052, "epoch": 2 }, { "type": "loss", "content": 0.004875184502452612, "timestamp": "2025-09-15 03:17:00.053457", "step": 1053, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:17:00.641377", "step": 1053, "epoch": 2 }, { "type": "pplx", "content": 128433445.53436927, "timestamp": "2025-09-15 03:17:00.643332", "step": 1053, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:00.672739", "step": 1053, "epoch": 2 }, { "type": "loss", "content": 0.00015715994231868535, "timestamp": "2025-09-15 03:17:00.679798", "step": 1054, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:00.710352", "step": 1054, "epoch": 2 }, { "type": "loss", "content": 0.0015639655757695436, "timestamp": "2025-09-15 03:17:00.712423", "step": 1055, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:00.742595", "step": 1055, "epoch": 2 }, { "type": "loss", "content": 0.001694310107268393, "timestamp": "2025-09-15 03:17:00.770164", "step": 1056, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:00.801004", "step": 1056, "epoch": 2 }, { "type": "loss", "content": 0.00971940066665411, "timestamp": "2025-09-15 03:17:00.803232", "step": 1057, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:00.833076", "step": 1057, "epoch": 2 }, { "type": "loss", "content": 0.010265431366860867, "timestamp": "2025-09-15 03:17:00.835373", "step": 1058, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:17:00.865884", "step": 1058, "epoch": 2 }, { "type": "loss", "content": 0.001557617331854999, "timestamp": "2025-09-15 03:17:00.873290", "step": 1059, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:00.904138", "step": 1059, "epoch": 2 }, { "type": "loss", "content": 0.0001345228374702856, "timestamp": "2025-09-15 03:17:00.927733", "step": 1060, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:00.959287", "step": 1060, "epoch": 2 }, { "type": "loss", "content": 0.0007394430576823652, "timestamp": "2025-09-15 03:17:00.963858", "step": 1061, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:00.994856", "step": 1061, "epoch": 2 }, { "type": "loss", "content": 0.0004073964955750853, "timestamp": "2025-09-15 03:17:00.997074", "step": 1062, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:01.028350", "step": 1062, "epoch": 2 }, { "type": "loss", "content": 0.004258634988218546, "timestamp": "2025-09-15 03:17:01.030874", "step": 1063, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:01.061520", "step": 1063, "epoch": 2 }, { "type": "loss", "content": 0.0001565528364153579, "timestamp": "2025-09-15 03:17:01.089055", "step": 1064, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:01.120900", "step": 1064, "epoch": 2 }, { "type": "loss", "content": 0.01607772521674633, "timestamp": "2025-09-15 03:17:01.123003", "step": 1065, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:01.154524", "step": 1065, "epoch": 2 }, { "type": "loss", "content": 0.00016873667482286692, "timestamp": "2025-09-15 03:17:01.161852", "step": 1066, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:01.193229", "step": 1066, "epoch": 2 }, { "type": "loss", "content": 0.00010801710595842451, "timestamp": "2025-09-15 03:17:01.195342", "step": 1067, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:01.226064", "step": 1067, "epoch": 2 }, { "type": "loss", "content": 6.337455852190033e-05, "timestamp": "2025-09-15 03:17:01.249880", "step": 1068, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:01.280909", "step": 1068, "epoch": 2 }, { "type": "loss", "content": 7.294497481780127e-05, "timestamp": "2025-09-15 03:17:01.283131", "step": 1069, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:01.314083", "step": 1069, "epoch": 2 }, { "type": "loss", "content": 0.00018661771900951862, "timestamp": "2025-09-15 03:17:01.316371", "step": 1070, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:01.347410", "step": 1070, "epoch": 2 }, { "type": "loss", "content": 7.31277177692391e-05, "timestamp": "2025-09-15 03:17:01.349480", "step": 1071, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:01.379818", "step": 1071, "epoch": 2 }, { "type": "loss", "content": 0.003055393695831299, "timestamp": "2025-09-15 03:17:01.403502", "step": 1072, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:01.434488", "step": 1072, "epoch": 2 }, { "type": "loss", "content": 0.0011397640919312835, "timestamp": "2025-09-15 03:17:01.436578", "step": 1073, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:01.466626", "step": 1073, "epoch": 2 }, { "type": "loss", "content": 0.00018760105012916028, "timestamp": "2025-09-15 03:17:01.469116", "step": 1074, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:01.499841", "step": 1074, "epoch": 2 }, { "type": "loss", "content": 0.03531399741768837, "timestamp": "2025-09-15 03:17:01.502125", "step": 1075, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:01.533233", "step": 1075, "epoch": 2 }, { "type": "loss", "content": 0.0020788447000086308, "timestamp": "2025-09-15 03:17:01.558353", "step": 1076, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:01.589433", "step": 1076, "epoch": 2 }, { "type": "loss", "content": 0.006431459914892912, "timestamp": "2025-09-15 03:17:01.591636", "step": 1077, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:01.622417", "step": 1077, "epoch": 2 }, { "type": "loss", "content": 0.00033594778506085277, "timestamp": "2025-09-15 03:17:01.626190", "step": 1078, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:01.656675", "step": 1078, "epoch": 2 }, { "type": "loss", "content": 0.000293986959150061, "timestamp": "2025-09-15 03:17:01.663368", "step": 1079, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-15 03:17:01.694947", "step": 1079, "epoch": 2 }, { "type": "loss", "content": 6.315172504400834e-05, "timestamp": "2025-09-15 03:17:01.725903", "step": 1080, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:01.756602", "step": 1080, "epoch": 2 }, { "type": "loss", "content": 0.0007892033900134265, "timestamp": "2025-09-15 03:17:01.761025", "step": 1081, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:01.791155", "step": 1081, "epoch": 2 }, { "type": "loss", "content": 0.0016475154552608728, "timestamp": "2025-09-15 03:17:01.793188", "step": 1082, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:01.823978", "step": 1082, "epoch": 2 }, { "type": "loss", "content": 0.00014074041973799467, "timestamp": "2025-09-15 03:17:01.825973", "step": 1083, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:01.857526", "step": 1083, "epoch": 2 }, { "type": "loss", "content": 0.010327513329684734, "timestamp": "2025-09-15 03:17:01.881062", "step": 1084, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:01.912737", "step": 1084, "epoch": 2 }, { "type": "loss", "content": 0.03140605241060257, "timestamp": "2025-09-15 03:17:01.914853", "step": 1085, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:01.945337", "step": 1085, "epoch": 2 }, { "type": "loss", "content": 0.00010317912528989837, "timestamp": "2025-09-15 03:17:01.947950", "step": 1086, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:01.978531", "step": 1086, "epoch": 2 }, { "type": "loss", "content": 0.00036211867700330913, "timestamp": "2025-09-15 03:17:01.981393", "step": 1087, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:02.011645", "step": 1087, "epoch": 2 }, { "type": "loss", "content": 0.0008095457451418042, "timestamp": "2025-09-15 03:17:02.036809", "step": 1088, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:02.067225", "step": 1088, "epoch": 2 }, { "type": "loss", "content": 0.0014814898604527116, "timestamp": "2025-09-15 03:17:02.069343", "step": 1089, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:02.100634", "step": 1089, "epoch": 2 }, { "type": "loss", "content": 0.0004058975900989026, "timestamp": "2025-09-15 03:17:02.104219", "step": 1090, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:02.134852", "step": 1090, "epoch": 2 }, { "type": "loss", "content": 0.0005335028981789947, "timestamp": "2025-09-15 03:17:02.137050", "step": 1091, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:02.168095", "step": 1091, "epoch": 2 }, { "type": "loss", "content": 7.210144394775853e-05, "timestamp": "2025-09-15 03:17:02.191705", "step": 1092, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:17:02.783176", "step": 1092, "epoch": 2 }, { "type": "pplx", "content": 125546625.83764112, "timestamp": "2025-09-15 03:17:02.785083", "step": 1092, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:02.814532", "step": 1092, "epoch": 2 }, { "type": "loss", "content": 0.00045872549526393414, "timestamp": "2025-09-15 03:17:02.816946", "step": 1093, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:02.849473", "step": 1093, "epoch": 2 }, { "type": "loss", "content": 0.0002017955994233489, "timestamp": "2025-09-15 03:17:02.851516", "step": 1094, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:02.882138", "step": 1094, "epoch": 2 }, { "type": "loss", "content": 0.014432324096560478, "timestamp": "2025-09-15 03:17:02.886230", "step": 1095, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:02.917625", "step": 1095, "epoch": 2 }, { "type": "loss", "content": 0.00027981828316114843, "timestamp": "2025-09-15 03:17:02.945326", "step": 1096, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:02.976326", "step": 1096, "epoch": 2 }, { "type": "loss", "content": 0.00010119278158526868, "timestamp": "2025-09-15 03:17:02.978535", "step": 1097, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:03.016790", "step": 1097, "epoch": 2 }, { "type": "loss", "content": 0.0008557759574614465, "timestamp": "2025-09-15 03:17:03.021161", "step": 1098, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:03.052725", "step": 1098, "epoch": 2 }, { "type": "loss", "content": 0.0012092305114492774, "timestamp": "2025-09-15 03:17:03.059485", "step": 1099, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:03.089674", "step": 1099, "epoch": 2 }, { "type": "loss", "content": 0.02371017076075077, "timestamp": "2025-09-15 03:17:03.113295", "step": 1100, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:03.146225", "step": 1100, "epoch": 2 }, { "type": "loss", "content": 0.0003077341243624687, "timestamp": "2025-09-15 03:17:03.148479", "step": 1101, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:03.179937", "step": 1101, "epoch": 2 }, { "type": "loss", "content": 0.0024734383914619684, "timestamp": "2025-09-15 03:17:03.183518", "step": 1102, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:03.214077", "step": 1102, "epoch": 2 }, { "type": "loss", "content": 0.0008149113273248076, "timestamp": "2025-09-15 03:17:03.216186", "step": 1103, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:03.248301", "step": 1103, "epoch": 2 }, { "type": "loss", "content": 0.0012873790692538023, "timestamp": "2025-09-15 03:17:03.272061", "step": 1104, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:03.302760", "step": 1104, "epoch": 2 }, { "type": "loss", "content": 0.00025210640160366893, "timestamp": "2025-09-15 03:17:03.305310", "step": 1105, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:03.336477", "step": 1105, "epoch": 2 }, { "type": "loss", "content": 0.0013830851530656219, "timestamp": "2025-09-15 03:17:03.340368", "step": 1106, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:03.371161", "step": 1106, "epoch": 2 }, { "type": "loss", "content": 0.0004376484430395067, "timestamp": "2025-09-15 03:17:03.373884", "step": 1107, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-15 03:17:03.405828", "step": 1107, "epoch": 2 }, { "type": "loss", "content": 0.005435534752905369, "timestamp": "2025-09-15 03:17:03.436659", "step": 1108, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:03.467201", "step": 1108, "epoch": 2 }, { "type": "loss", "content": 0.007525573950260878, "timestamp": "2025-09-15 03:17:03.469369", "step": 1109, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:03.499274", "step": 1109, "epoch": 2 }, { "type": "loss", "content": 0.0014803019585087895, "timestamp": "2025-09-15 03:17:03.501839", "step": 1110, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:03.532383", "step": 1110, "epoch": 2 }, { "type": "loss", "content": 0.0016765117179602385, "timestamp": "2025-09-15 03:17:03.534631", "step": 1111, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:03.565067", "step": 1111, "epoch": 2 }, { "type": "loss", "content": 0.0002567689516581595, "timestamp": "2025-09-15 03:17:03.588859", "step": 1112, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:03.620315", "step": 1112, "epoch": 2 }, { "type": "loss", "content": 0.00035767193185165524, "timestamp": "2025-09-15 03:17:03.623949", "step": 1113, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:03.654710", "step": 1113, "epoch": 2 }, { "type": "loss", "content": 0.002797603141516447, "timestamp": "2025-09-15 03:17:03.656797", "step": 1114, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:03.687752", "step": 1114, "epoch": 2 }, { "type": "loss", "content": 0.00018568325322121382, "timestamp": "2025-09-15 03:17:03.689987", "step": 1115, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:17:03.722101", "step": 1115, "epoch": 2 }, { "type": "loss", "content": 0.00015704214456491172, "timestamp": "2025-09-15 03:17:03.750370", "step": 1116, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:03.781058", "step": 1116, "epoch": 2 }, { "type": "loss", "content": 0.001136482460424304, "timestamp": "2025-09-15 03:17:03.783206", "step": 1117, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:03.813877", "step": 1117, "epoch": 2 }, { "type": "loss", "content": 0.0017352696740999818, "timestamp": "2025-09-15 03:17:03.816024", "step": 1118, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:03.847254", "step": 1118, "epoch": 2 }, { "type": "loss", "content": 0.00021176310838200152, "timestamp": "2025-09-15 03:17:03.851378", "step": 1119, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:03.882018", "step": 1119, "epoch": 2 }, { "type": "loss", "content": 0.01274438202381134, "timestamp": "2025-09-15 03:17:03.905831", "step": 1120, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:17:03.936972", "step": 1120, "epoch": 2 }, { "type": "loss", "content": 0.0004901143256574869, "timestamp": "2025-09-15 03:17:03.942212", "step": 1121, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:03.972830", "step": 1121, "epoch": 2 }, { "type": "loss", "content": 0.0018064085161313415, "timestamp": "2025-09-15 03:17:03.974979", "step": 1122, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:04.006128", "step": 1122, "epoch": 2 }, { "type": "loss", "content": 0.004721564706414938, "timestamp": "2025-09-15 03:17:04.012697", "step": 1123, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:04.043430", "step": 1123, "epoch": 2 }, { "type": "loss", "content": 0.022109517827630043, "timestamp": "2025-09-15 03:17:04.066826", "step": 1124, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:04.097736", "step": 1124, "epoch": 2 }, { "type": "loss", "content": 0.00010318388376617804, "timestamp": "2025-09-15 03:17:04.099961", "step": 1125, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 304 ], "flops": 9017733530176 }, "timestamp": "2025-09-15 03:17:04.130625", "step": 1125, "epoch": 2 }, { "type": "loss", "content": 0.0013412336120381951, "timestamp": "2025-09-15 03:17:04.142684", "step": 1126, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:04.173163", "step": 1126, "epoch": 2 }, { "type": "loss", "content": 8.792152220848948e-05, "timestamp": "2025-09-15 03:17:04.177172", "step": 1127, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:04.207499", "step": 1127, "epoch": 2 }, { "type": "loss", "content": 0.014445015229284763, "timestamp": "2025-09-15 03:17:04.232346", "step": 1128, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:04.262924", "step": 1128, "epoch": 2 }, { "type": "loss", "content": 0.0215358417481184, "timestamp": "2025-09-15 03:17:04.264843", "step": 1129, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:04.295527", "step": 1129, "epoch": 2 }, { "type": "loss", "content": 0.0004923886153846979, "timestamp": "2025-09-15 03:17:04.302837", "step": 1130, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:04.333266", "step": 1130, "epoch": 2 }, { "type": "loss", "content": 0.00025934947188943624, "timestamp": "2025-09-15 03:17:04.335529", "step": 1131, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:17:04.942642", "step": 1131, "epoch": 2 }, { "type": "pplx", "content": 122099526.47280383, "timestamp": "2025-09-15 03:17:04.944835", "step": 1131, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:04.974377", "step": 1131, "epoch": 2 }, { "type": "loss", "content": 0.00045106312609277666, "timestamp": "2025-09-15 03:17:04.998323", "step": 1132, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:05.029878", "step": 1132, "epoch": 2 }, { "type": "loss", "content": 0.00016361901361960918, "timestamp": "2025-09-15 03:17:05.034062", "step": 1133, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:05.064589", "step": 1133, "epoch": 2 }, { "type": "loss", "content": 0.0002141120348824188, "timestamp": "2025-09-15 03:17:05.066971", "step": 1134, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:05.097692", "step": 1134, "epoch": 2 }, { "type": "loss", "content": 0.0009333607158623636, "timestamp": "2025-09-15 03:17:05.101781", "step": 1135, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:17:05.132724", "step": 1135, "epoch": 2 }, { "type": "loss", "content": 0.0034767123870551586, "timestamp": "2025-09-15 03:17:05.161155", "step": 1136, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:05.192239", "step": 1136, "epoch": 2 }, { "type": "loss", "content": 0.0001437151659047231, "timestamp": "2025-09-15 03:17:05.194436", "step": 1137, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:05.224976", "step": 1137, "epoch": 2 }, { "type": "loss", "content": 0.0009071500971913338, "timestamp": "2025-09-15 03:17:05.231972", "step": 1138, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 80 ], "flops": 2373281365952 }, "timestamp": "2025-09-15 03:17:05.262259", "step": 1138, "epoch": 2 }, { "type": "loss", "content": 0.0008537854300811887, "timestamp": "2025-09-15 03:17:05.264356", "step": 1139, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:05.294797", "step": 1139, "epoch": 2 }, { "type": "loss", "content": 0.0015601081540808082, "timestamp": "2025-09-15 03:17:05.318331", "step": 1140, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:05.349035", "step": 1140, "epoch": 2 }, { "type": "loss", "content": 0.0004569217562675476, "timestamp": "2025-09-15 03:17:05.353546", "step": 1141, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:05.385607", "step": 1141, "epoch": 2 }, { "type": "loss", "content": 0.009549853391945362, "timestamp": "2025-09-15 03:17:05.389555", "step": 1142, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:05.420992", "step": 1142, "epoch": 2 }, { "type": "loss", "content": 0.00030883742147125304, "timestamp": "2025-09-15 03:17:05.423042", "step": 1143, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:05.453101", "step": 1143, "epoch": 2 }, { "type": "loss", "content": 0.005748115945607424, "timestamp": "2025-09-15 03:17:05.476823", "step": 1144, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:05.507371", "step": 1144, "epoch": 2 }, { "type": "loss", "content": 0.0008632918470539153, "timestamp": "2025-09-15 03:17:05.509481", "step": 1145, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:05.539429", "step": 1145, "epoch": 2 }, { "type": "loss", "content": 0.003175254911184311, "timestamp": "2025-09-15 03:17:05.543879", "step": 1146, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:05.575058", "step": 1146, "epoch": 2 }, { "type": "loss", "content": 0.00045585259795188904, "timestamp": "2025-09-15 03:17:05.581557", "step": 1147, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:05.612199", "step": 1147, "epoch": 2 }, { "type": "loss", "content": 0.005220616701990366, "timestamp": "2025-09-15 03:17:05.635904", "step": 1148, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:05.667118", "step": 1148, "epoch": 2 }, { "type": "loss", "content": 0.005695756059139967, "timestamp": "2025-09-15 03:17:05.669117", "step": 1149, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:05.699733", "step": 1149, "epoch": 2 }, { "type": "loss", "content": 0.005306906998157501, "timestamp": "2025-09-15 03:17:05.706426", "step": 1150, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:05.737014", "step": 1150, "epoch": 2 }, { "type": "loss", "content": 0.00037352906656451523, "timestamp": "2025-09-15 03:17:05.743739", "step": 1151, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:05.774727", "step": 1151, "epoch": 2 }, { "type": "loss", "content": 0.030560487881302834, "timestamp": "2025-09-15 03:17:05.798353", "step": 1152, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:05.828614", "step": 1152, "epoch": 2 }, { "type": "loss", "content": 0.002387533662840724, "timestamp": "2025-09-15 03:17:05.830772", "step": 1153, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:05.861247", "step": 1153, "epoch": 2 }, { "type": "loss", "content": 0.0002033365744864568, "timestamp": "2025-09-15 03:17:05.863116", "step": 1154, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:05.893569", "step": 1154, "epoch": 2 }, { "type": "loss", "content": 6.287390715442598e-05, "timestamp": "2025-09-15 03:17:05.895757", "step": 1155, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:05.926717", "step": 1155, "epoch": 2 }, { "type": "loss", "content": 0.0009798947721719742, "timestamp": "2025-09-15 03:17:05.950287", "step": 1156, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:05.982146", "step": 1156, "epoch": 2 }, { "type": "loss", "content": 7.104845280991867e-05, "timestamp": "2025-09-15 03:17:05.984187", "step": 1157, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:06.014629", "step": 1157, "epoch": 2 }, { "type": "loss", "content": 8.981012069853023e-05, "timestamp": "2025-09-15 03:17:06.016817", "step": 1158, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:06.048115", "step": 1158, "epoch": 2 }, { "type": "loss", "content": 0.0008855522028170526, "timestamp": "2025-09-15 03:17:06.051843", "step": 1159, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:06.083815", "step": 1159, "epoch": 2 }, { "type": "loss", "content": 0.005247277207672596, "timestamp": "2025-09-15 03:17:06.107239", "step": 1160, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:06.138635", "step": 1160, "epoch": 2 }, { "type": "loss", "content": 0.003661633934825659, "timestamp": "2025-09-15 03:17:06.140772", "step": 1161, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:06.171486", "step": 1161, "epoch": 2 }, { "type": "loss", "content": 0.004740457516163588, "timestamp": "2025-09-15 03:17:06.178110", "step": 1162, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:06.209012", "step": 1162, "epoch": 2 }, { "type": "loss", "content": 0.0006330661126412451, "timestamp": "2025-09-15 03:17:06.212899", "step": 1163, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:06.243996", "step": 1163, "epoch": 2 }, { "type": "loss", "content": 0.029458897188305855, "timestamp": "2025-09-15 03:17:06.271491", "step": 1164, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:06.301940", "step": 1164, "epoch": 2 }, { "type": "loss", "content": 0.0018987554358318448, "timestamp": "2025-09-15 03:17:06.304019", "step": 1165, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:06.334234", "step": 1165, "epoch": 2 }, { "type": "loss", "content": 0.02847990207374096, "timestamp": "2025-09-15 03:17:06.338249", "step": 1166, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:06.369426", "step": 1166, "epoch": 2 }, { "type": "loss", "content": 0.003378561232239008, "timestamp": "2025-09-15 03:17:06.371776", "step": 1167, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:06.402487", "step": 1167, "epoch": 2 }, { "type": "loss", "content": 0.00012590606638696045, "timestamp": "2025-09-15 03:17:06.426082", "step": 1168, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:06.456755", "step": 1168, "epoch": 2 }, { "type": "loss", "content": 0.008719941601157188, "timestamp": "2025-09-15 03:17:06.459685", "step": 1169, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:06.491056", "step": 1169, "epoch": 2 }, { "type": "loss", "content": 0.0001788309746189043, "timestamp": "2025-09-15 03:17:06.497885", "step": 1170, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:17:07.090935", "step": 1170, "epoch": 2 }, { "type": "pplx", "content": 120217233.33391637, "timestamp": "2025-09-15 03:17:07.092896", "step": 1170, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:07.123131", "step": 1170, "epoch": 2 }, { "type": "loss", "content": 0.0017449696315452456, "timestamp": "2025-09-15 03:17:07.125347", "step": 1171, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:07.156096", "step": 1171, "epoch": 2 }, { "type": "loss", "content": 5.670529208146036e-05, "timestamp": "2025-09-15 03:17:07.179700", "step": 1172, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:07.210374", "step": 1172, "epoch": 2 }, { "type": "loss", "content": 0.002883358160033822, "timestamp": "2025-09-15 03:17:07.212866", "step": 1173, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:07.243655", "step": 1173, "epoch": 2 }, { "type": "loss", "content": 0.00012270697334315628, "timestamp": "2025-09-15 03:17:07.245616", "step": 1174, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:07.276156", "step": 1174, "epoch": 2 }, { "type": "loss", "content": 0.00018073969113174826, "timestamp": "2025-09-15 03:17:07.282715", "step": 1175, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:07.313486", "step": 1175, "epoch": 2 }, { "type": "loss", "content": 0.00023215598776005208, "timestamp": "2025-09-15 03:17:07.337087", "step": 1176, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:07.367681", "step": 1176, "epoch": 2 }, { "type": "loss", "content": 0.0001968259020941332, "timestamp": "2025-09-15 03:17:07.370962", "step": 1177, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:07.401091", "step": 1177, "epoch": 2 }, { "type": "loss", "content": 0.00012490291555877775, "timestamp": "2025-09-15 03:17:07.403092", "step": 1178, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:07.433654", "step": 1178, "epoch": 2 }, { "type": "loss", "content": 0.0007915376918390393, "timestamp": "2025-09-15 03:17:07.437907", "step": 1179, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:07.468907", "step": 1179, "epoch": 2 }, { "type": "loss", "content": 0.0008563162409700453, "timestamp": "2025-09-15 03:17:07.494042", "step": 1180, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:07.524473", "step": 1180, "epoch": 2 }, { "type": "loss", "content": 0.000734034925699234, "timestamp": "2025-09-15 03:17:07.526753", "step": 1181, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:07.557402", "step": 1181, "epoch": 2 }, { "type": "loss", "content": 0.0029953443445265293, "timestamp": "2025-09-15 03:17:07.559647", "step": 1182, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:07.590136", "step": 1182, "epoch": 2 }, { "type": "loss", "content": 0.00032776501029729843, "timestamp": "2025-09-15 03:17:07.593866", "step": 1183, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:07.625250", "step": 1183, "epoch": 2 }, { "type": "loss", "content": 0.003944663796573877, "timestamp": "2025-09-15 03:17:07.652739", "step": 1184, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:07.685969", "step": 1184, "epoch": 2 }, { "type": "loss", "content": 0.0007250604103319347, "timestamp": "2025-09-15 03:17:07.690343", "step": 1185, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:07.720759", "step": 1185, "epoch": 2 }, { "type": "loss", "content": 0.00011083681602030993, "timestamp": "2025-09-15 03:17:07.722904", "step": 1186, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:07.753963", "step": 1186, "epoch": 2 }, { "type": "loss", "content": 0.0006312718614935875, "timestamp": "2025-09-15 03:17:07.758226", "step": 1187, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:07.789002", "step": 1187, "epoch": 2 }, { "type": "loss", "content": 0.0006342697306536138, "timestamp": "2025-09-15 03:17:07.816687", "step": 1188, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:07.847386", "step": 1188, "epoch": 2 }, { "type": "loss", "content": 0.006963182706385851, "timestamp": "2025-09-15 03:17:07.849520", "step": 1189, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:07.879628", "step": 1189, "epoch": 2 }, { "type": "loss", "content": 0.00021931444643996656, "timestamp": "2025-09-15 03:17:07.881968", "step": 1190, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:07.913665", "step": 1190, "epoch": 2 }, { "type": "loss", "content": 0.00013834961282555014, "timestamp": "2025-09-15 03:17:07.920242", "step": 1191, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:07.951175", "step": 1191, "epoch": 2 }, { "type": "loss", "content": 7.177861698437482e-05, "timestamp": "2025-09-15 03:17:07.974898", "step": 1192, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:08.006731", "step": 1192, "epoch": 2 }, { "type": "loss", "content": 0.0039025992155075073, "timestamp": "2025-09-15 03:17:08.008953", "step": 1193, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:08.039818", "step": 1193, "epoch": 2 }, { "type": "loss", "content": 8.208167128032073e-05, "timestamp": "2025-09-15 03:17:08.046515", "step": 1194, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:08.077571", "step": 1194, "epoch": 2 }, { "type": "loss", "content": 0.00020119234977755696, "timestamp": "2025-09-15 03:17:08.081437", "step": 1195, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:08.113336", "step": 1195, "epoch": 2 }, { "type": "loss", "content": 0.002922703977674246, "timestamp": "2025-09-15 03:17:08.137106", "step": 1196, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:08.167695", "step": 1196, "epoch": 2 }, { "type": "loss", "content": 5.8547982916934416e-05, "timestamp": "2025-09-15 03:17:08.171827", "step": 1197, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:08.203339", "step": 1197, "epoch": 2 }, { "type": "loss", "content": 0.005056845489889383, "timestamp": "2025-09-15 03:17:08.210471", "step": 1198, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:08.241147", "step": 1198, "epoch": 2 }, { "type": "loss", "content": 5.344694000086747e-05, "timestamp": "2025-09-15 03:17:08.243467", "step": 1199, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:08.274387", "step": 1199, "epoch": 2 }, { "type": "loss", "content": 0.0004614375939127058, "timestamp": "2025-09-15 03:17:08.297865", "step": 1200, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:08.328913", "step": 1200, "epoch": 2 }, { "type": "loss", "content": 0.0005172430537641048, "timestamp": "2025-09-15 03:17:08.331081", "step": 1201, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:08.361990", "step": 1201, "epoch": 2 }, { "type": "loss", "content": 0.0038748490624129772, "timestamp": "2025-09-15 03:17:08.368640", "step": 1202, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:08.400412", "step": 1202, "epoch": 2 }, { "type": "loss", "content": 0.00013763025344815105, "timestamp": "2025-09-15 03:17:08.404251", "step": 1203, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:08.434970", "step": 1203, "epoch": 2 }, { "type": "loss", "content": 0.00014176352124195546, "timestamp": "2025-09-15 03:17:08.460040", "step": 1204, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:08.490957", "step": 1204, "epoch": 2 }, { "type": "loss", "content": 6.545721407746896e-05, "timestamp": "2025-09-15 03:17:08.496008", "step": 1205, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:08.526868", "step": 1205, "epoch": 2 }, { "type": "loss", "content": 0.00043702914263121784, "timestamp": "2025-09-15 03:17:08.530901", "step": 1206, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:08.561892", "step": 1206, "epoch": 2 }, { "type": "loss", "content": 0.00014172600640449673, "timestamp": "2025-09-15 03:17:08.565895", "step": 1207, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:08.596726", "step": 1207, "epoch": 2 }, { "type": "loss", "content": 0.010499807074666023, "timestamp": "2025-09-15 03:17:08.620459", "step": 1208, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:08.651516", "step": 1208, "epoch": 2 }, { "type": "loss", "content": 0.00040008421638049185, "timestamp": "2025-09-15 03:17:08.653846", "step": 1209, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:17:09.242304", "step": 1209, "epoch": 2 }, { "type": "pplx", "content": 126998915.8734457, "timestamp": "2025-09-15 03:17:09.244291", "step": 1209, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:09.272913", "step": 1209, "epoch": 2 }, { "type": "loss", "content": 0.0004073081654496491, "timestamp": "2025-09-15 03:17:09.277152", "step": 1210, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:09.307353", "step": 1210, "epoch": 2 }, { "type": "loss", "content": 0.01717802695930004, "timestamp": "2025-09-15 03:17:09.309297", "step": 1211, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:09.339861", "step": 1211, "epoch": 2 }, { "type": "loss", "content": 0.0004054214514326304, "timestamp": "2025-09-15 03:17:09.364516", "step": 1212, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:09.394786", "step": 1212, "epoch": 2 }, { "type": "loss", "content": 0.00017672948888503015, "timestamp": "2025-09-15 03:17:09.397424", "step": 1213, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:09.429154", "step": 1213, "epoch": 2 }, { "type": "loss", "content": 0.00019165755657013506, "timestamp": "2025-09-15 03:17:09.433302", "step": 1214, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:09.464113", "step": 1214, "epoch": 2 }, { "type": "loss", "content": 3.5844284866470844e-05, "timestamp": "2025-09-15 03:17:09.466394", "step": 1215, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:09.497093", "step": 1215, "epoch": 2 }, { "type": "loss", "content": 0.00017610486247576773, "timestamp": "2025-09-15 03:17:09.520789", "step": 1216, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:09.551283", "step": 1216, "epoch": 2 }, { "type": "loss", "content": 8.69460855028592e-05, "timestamp": "2025-09-15 03:17:09.553499", "step": 1217, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:09.584416", "step": 1217, "epoch": 2 }, { "type": "loss", "content": 0.0020858903881162405, "timestamp": "2025-09-15 03:17:09.586838", "step": 1218, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:09.617183", "step": 1218, "epoch": 2 }, { "type": "loss", "content": 0.00037857977440580726, "timestamp": "2025-09-15 03:17:09.619561", "step": 1219, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:09.650274", "step": 1219, "epoch": 2 }, { "type": "loss", "content": 0.00012488874199334532, "timestamp": "2025-09-15 03:17:09.673861", "step": 1220, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:09.704428", "step": 1220, "epoch": 2 }, { "type": "loss", "content": 5.675610373145901e-05, "timestamp": "2025-09-15 03:17:09.706697", "step": 1221, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:09.737747", "step": 1221, "epoch": 2 }, { "type": "loss", "content": 0.00016086138202808797, "timestamp": "2025-09-15 03:17:09.739851", "step": 1222, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:17:09.770677", "step": 1222, "epoch": 2 }, { "type": "loss", "content": 0.013607896864414215, "timestamp": "2025-09-15 03:17:09.778253", "step": 1223, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:09.809227", "step": 1223, "epoch": 2 }, { "type": "loss", "content": 0.03053331933915615, "timestamp": "2025-09-15 03:17:09.832750", "step": 1224, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:09.864133", "step": 1224, "epoch": 2 }, { "type": "loss", "content": 0.00015589622489642352, "timestamp": "2025-09-15 03:17:09.868266", "step": 1225, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:09.899025", "step": 1225, "epoch": 2 }, { "type": "loss", "content": 0.0008411963353864849, "timestamp": "2025-09-15 03:17:09.902154", "step": 1226, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:09.933017", "step": 1226, "epoch": 2 }, { "type": "loss", "content": 0.00041311918175779283, "timestamp": "2025-09-15 03:17:09.937236", "step": 1227, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:09.967738", "step": 1227, "epoch": 2 }, { "type": "loss", "content": 0.0003874626418109983, "timestamp": "2025-09-15 03:17:09.991429", "step": 1228, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:10.022233", "step": 1228, "epoch": 2 }, { "type": "loss", "content": 0.0001830274413805455, "timestamp": "2025-09-15 03:17:10.024501", "step": 1229, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:10.055105", "step": 1229, "epoch": 2 }, { "type": "loss", "content": 0.05192428082227707, "timestamp": "2025-09-15 03:17:10.057900", "step": 1230, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:10.088440", "step": 1230, "epoch": 2 }, { "type": "loss", "content": 8.481832628604025e-05, "timestamp": "2025-09-15 03:17:10.090863", "step": 1231, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:10.121758", "step": 1231, "epoch": 2 }, { "type": "loss", "content": 0.0008766755345277488, "timestamp": "2025-09-15 03:17:10.146894", "step": 1232, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:10.177054", "step": 1232, "epoch": 2 }, { "type": "loss", "content": 0.00022905074001755565, "timestamp": "2025-09-15 03:17:10.179338", "step": 1233, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:10.209814", "step": 1233, "epoch": 2 }, { "type": "loss", "content": 0.007411510683596134, "timestamp": "2025-09-15 03:17:10.213938", "step": 1234, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:10.247195", "step": 1234, "epoch": 2 }, { "type": "loss", "content": 0.0008861946989782155, "timestamp": "2025-09-15 03:17:10.253766", "step": 1235, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:10.285336", "step": 1235, "epoch": 2 }, { "type": "loss", "content": 4.034994708490558e-05, "timestamp": "2025-09-15 03:17:10.312785", "step": 1236, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:10.345919", "step": 1236, "epoch": 2 }, { "type": "loss", "content": 0.0019980978686362505, "timestamp": "2025-09-15 03:17:10.347998", "step": 1237, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:10.378577", "step": 1237, "epoch": 2 }, { "type": "loss", "content": 0.001465094625018537, "timestamp": "2025-09-15 03:17:10.385492", "step": 1238, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:10.415967", "step": 1238, "epoch": 2 }, { "type": "loss", "content": 0.017953436821699142, "timestamp": "2025-09-15 03:17:10.418226", "step": 1239, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:10.450453", "step": 1239, "epoch": 2 }, { "type": "loss", "content": 0.0008248678059317172, "timestamp": "2025-09-15 03:17:10.478465", "step": 1240, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:10.509760", "step": 1240, "epoch": 2 }, { "type": "loss", "content": 0.012165131978690624, "timestamp": "2025-09-15 03:17:10.511731", "step": 1241, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:10.542917", "step": 1241, "epoch": 2 }, { "type": "loss", "content": 0.0002835580671671778, "timestamp": "2025-09-15 03:17:10.546925", "step": 1242, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:10.577593", "step": 1242, "epoch": 2 }, { "type": "loss", "content": 0.00029444595566019416, "timestamp": "2025-09-15 03:17:10.581373", "step": 1243, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:10.612272", "step": 1243, "epoch": 2 }, { "type": "loss", "content": 0.0013534992467612028, "timestamp": "2025-09-15 03:17:10.636768", "step": 1244, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:10.668331", "step": 1244, "epoch": 2 }, { "type": "loss", "content": 0.00040784329758025706, "timestamp": "2025-09-15 03:17:10.672322", "step": 1245, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 2, 192 ], "flops": 2847885110400 }, "timestamp": "2025-09-15 03:17:10.703260", "step": 1245, "epoch": 2 }, { "type": "loss", "content": 0.00012379752297420055, "timestamp": "2025-09-15 03:17:10.705681", "step": 1246, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:10.755385", "step": 1246, "epoch": 3 }, { "type": "loss", "content": 0.004293285310268402, "timestamp": "2025-09-15 03:17:10.758474", "step": 1247, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:10.789678", "step": 1247, "epoch": 3 }, { "type": "loss", "content": 0.002140170196071267, "timestamp": "2025-09-15 03:17:10.813333", "step": 1248, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:17:11.402373", "step": 1248, "epoch": 3 }, { "type": "pplx", "content": 125033048.54934657, "timestamp": "2025-09-15 03:17:11.404551", "step": 1248, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:11.432890", "step": 1248, "epoch": 3 }, { "type": "loss", "content": 0.00016366194176953286, "timestamp": "2025-09-15 03:17:11.435041", "step": 1249, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:11.465767", "step": 1249, "epoch": 3 }, { "type": "loss", "content": 0.0023328284732997417, "timestamp": "2025-09-15 03:17:11.469528", "step": 1250, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:11.500230", "step": 1250, "epoch": 3 }, { "type": "loss", "content": 0.0001446189417038113, "timestamp": "2025-09-15 03:17:11.507410", "step": 1251, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:11.537923", "step": 1251, "epoch": 3 }, { "type": "loss", "content": 0.0007846828666515648, "timestamp": "2025-09-15 03:17:11.563174", "step": 1252, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:11.593624", "step": 1252, "epoch": 3 }, { "type": "loss", "content": 0.00610629515722394, "timestamp": "2025-09-15 03:17:11.595673", "step": 1253, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:11.625907", "step": 1253, "epoch": 3 }, { "type": "loss", "content": 0.03473528474569321, "timestamp": "2025-09-15 03:17:11.630120", "step": 1254, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:11.661343", "step": 1254, "epoch": 3 }, { "type": "loss", "content": 0.0007956092595122755, "timestamp": "2025-09-15 03:17:11.663421", "step": 1255, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:11.693649", "step": 1255, "epoch": 3 }, { "type": "loss", "content": 0.0005844756960868835, "timestamp": "2025-09-15 03:17:11.717118", "step": 1256, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:11.748174", "step": 1256, "epoch": 3 }, { "type": "loss", "content": 0.005617126356810331, "timestamp": "2025-09-15 03:17:11.752873", "step": 1257, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:11.784850", "step": 1257, "epoch": 3 }, { "type": "loss", "content": 0.00012601922207977623, "timestamp": "2025-09-15 03:17:11.788471", "step": 1258, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:11.819867", "step": 1258, "epoch": 3 }, { "type": "loss", "content": 7.824383646948263e-05, "timestamp": "2025-09-15 03:17:11.822295", "step": 1259, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:11.853192", "step": 1259, "epoch": 3 }, { "type": "loss", "content": 0.004577727522701025, "timestamp": "2025-09-15 03:17:11.876651", "step": 1260, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:11.907607", "step": 1260, "epoch": 3 }, { "type": "loss", "content": 0.03704509511590004, "timestamp": "2025-09-15 03:17:11.909993", "step": 1261, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:11.941140", "step": 1261, "epoch": 3 }, { "type": "loss", "content": 0.002187976147979498, "timestamp": "2025-09-15 03:17:11.943525", "step": 1262, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:11.974060", "step": 1262, "epoch": 3 }, { "type": "loss", "content": 0.0003167348331771791, "timestamp": "2025-09-15 03:17:11.976351", "step": 1263, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:12.006870", "step": 1263, "epoch": 3 }, { "type": "loss", "content": 0.00025892173289321363, "timestamp": "2025-09-15 03:17:12.034443", "step": 1264, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:12.066523", "step": 1264, "epoch": 3 }, { "type": "loss", "content": 0.0006322207627817988, "timestamp": "2025-09-15 03:17:12.068514", "step": 1265, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:12.099093", "step": 1265, "epoch": 3 }, { "type": "loss", "content": 0.0007802381296642125, "timestamp": "2025-09-15 03:17:12.106323", "step": 1266, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:12.138278", "step": 1266, "epoch": 3 }, { "type": "loss", "content": 0.0011284631909802556, "timestamp": "2025-09-15 03:17:12.144637", "step": 1267, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:12.184579", "step": 1267, "epoch": 3 }, { "type": "loss", "content": 0.004695535637438297, "timestamp": "2025-09-15 03:17:12.208286", "step": 1268, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:12.239092", "step": 1268, "epoch": 3 }, { "type": "loss", "content": 0.02026580460369587, "timestamp": "2025-09-15 03:17:12.241220", "step": 1269, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:12.273008", "step": 1269, "epoch": 3 }, { "type": "loss", "content": 0.02153332531452179, "timestamp": "2025-09-15 03:17:12.277407", "step": 1270, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:12.308163", "step": 1270, "epoch": 3 }, { "type": "loss", "content": 0.00013611149915959686, "timestamp": "2025-09-15 03:17:12.314720", "step": 1271, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:17:12.349512", "step": 1271, "epoch": 3 }, { "type": "loss", "content": 0.0013938635820522904, "timestamp": "2025-09-15 03:17:12.377986", "step": 1272, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:12.411824", "step": 1272, "epoch": 3 }, { "type": "loss", "content": 0.021907132118940353, "timestamp": "2025-09-15 03:17:12.414248", "step": 1273, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:12.445187", "step": 1273, "epoch": 3 }, { "type": "loss", "content": 0.00254437536932528, "timestamp": "2025-09-15 03:17:12.447166", "step": 1274, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:12.486922", "step": 1274, "epoch": 3 }, { "type": "loss", "content": 0.0045538186095654964, "timestamp": "2025-09-15 03:17:12.493651", "step": 1275, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:12.525206", "step": 1275, "epoch": 3 }, { "type": "loss", "content": 0.00021196853776928037, "timestamp": "2025-09-15 03:17:12.549122", "step": 1276, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:12.581726", "step": 1276, "epoch": 3 }, { "type": "loss", "content": 0.00013388384832069278, "timestamp": "2025-09-15 03:17:12.583993", "step": 1277, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:12.615311", "step": 1277, "epoch": 3 }, { "type": "loss", "content": 0.0018024734454229474, "timestamp": "2025-09-15 03:17:12.622011", "step": 1278, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:12.659800", "step": 1278, "epoch": 3 }, { "type": "loss", "content": 0.008215748704969883, "timestamp": "2025-09-15 03:17:12.665937", "step": 1279, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:12.697138", "step": 1279, "epoch": 3 }, { "type": "loss", "content": 0.0039006711449474096, "timestamp": "2025-09-15 03:17:12.724650", "step": 1280, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:12.757677", "step": 1280, "epoch": 3 }, { "type": "loss", "content": 0.000163158867508173, "timestamp": "2025-09-15 03:17:12.761202", "step": 1281, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:12.792137", "step": 1281, "epoch": 3 }, { "type": "loss", "content": 0.005478131119161844, "timestamp": "2025-09-15 03:17:12.794229", "step": 1282, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:12.824751", "step": 1282, "epoch": 3 }, { "type": "loss", "content": 0.00642383610829711, "timestamp": "2025-09-15 03:17:12.830408", "step": 1283, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:12.865609", "step": 1283, "epoch": 3 }, { "type": "loss", "content": 0.0033451267518103123, "timestamp": "2025-09-15 03:17:12.889224", "step": 1284, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:12.919833", "step": 1284, "epoch": 3 }, { "type": "loss", "content": 0.015903616324067116, "timestamp": "2025-09-15 03:17:12.921990", "step": 1285, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:12.960698", "step": 1285, "epoch": 3 }, { "type": "loss", "content": 0.0002753768640104681, "timestamp": "2025-09-15 03:17:12.964625", "step": 1286, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:12.995067", "step": 1286, "epoch": 3 }, { "type": "loss", "content": 0.0006740019307471812, "timestamp": "2025-09-15 03:17:13.001635", "step": 1287, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:17:13.588533", "step": 1287, "epoch": 3 }, { "type": "pplx", "content": 117707103.54334375, "timestamp": "2025-09-15 03:17:13.590451", "step": 1287, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:13.620322", "step": 1287, "epoch": 3 }, { "type": "loss", "content": 0.017900949344038963, "timestamp": "2025-09-15 03:17:13.644774", "step": 1288, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:13.676991", "step": 1288, "epoch": 3 }, { "type": "loss", "content": 0.00039724045200273395, "timestamp": "2025-09-15 03:17:13.679062", "step": 1289, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:13.710333", "step": 1289, "epoch": 3 }, { "type": "loss", "content": 0.01001682598143816, "timestamp": "2025-09-15 03:17:13.716860", "step": 1290, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:13.748463", "step": 1290, "epoch": 3 }, { "type": "loss", "content": 0.0025902953930199146, "timestamp": "2025-09-15 03:17:13.751489", "step": 1291, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:13.782639", "step": 1291, "epoch": 3 }, { "type": "loss", "content": 0.00179091386962682, "timestamp": "2025-09-15 03:17:13.807955", "step": 1292, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:13.838599", "step": 1292, "epoch": 3 }, { "type": "loss", "content": 0.0013810384552925825, "timestamp": "2025-09-15 03:17:13.840951", "step": 1293, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:13.871492", "step": 1293, "epoch": 3 }, { "type": "loss", "content": 0.0022027629893273115, "timestamp": "2025-09-15 03:17:13.873514", "step": 1294, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:13.904060", "step": 1294, "epoch": 3 }, { "type": "loss", "content": 0.0019874197896569967, "timestamp": "2025-09-15 03:17:13.906219", "step": 1295, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:13.936900", "step": 1295, "epoch": 3 }, { "type": "loss", "content": 0.0016624435083940625, "timestamp": "2025-09-15 03:17:13.960697", "step": 1296, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:13.990556", "step": 1296, "epoch": 3 }, { "type": "loss", "content": 0.00470365583896637, "timestamp": "2025-09-15 03:17:13.992458", "step": 1297, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:14.023250", "step": 1297, "epoch": 3 }, { "type": "loss", "content": 0.00042901348206214607, "timestamp": "2025-09-15 03:17:14.025272", "step": 1298, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:14.055612", "step": 1298, "epoch": 3 }, { "type": "loss", "content": 0.0024480712600052357, "timestamp": "2025-09-15 03:17:14.058083", "step": 1299, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:14.088874", "step": 1299, "epoch": 3 }, { "type": "loss", "content": 0.0006906711496412754, "timestamp": "2025-09-15 03:17:14.112659", "step": 1300, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:14.143299", "step": 1300, "epoch": 3 }, { "type": "loss", "content": 0.009458900429308414, "timestamp": "2025-09-15 03:17:14.145420", "step": 1301, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:14.175812", "step": 1301, "epoch": 3 }, { "type": "loss", "content": 0.003652880433946848, "timestamp": "2025-09-15 03:17:14.177795", "step": 1302, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:14.208538", "step": 1302, "epoch": 3 }, { "type": "loss", "content": 0.0004979693330824375, "timestamp": "2025-09-15 03:17:14.210726", "step": 1303, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:14.241569", "step": 1303, "epoch": 3 }, { "type": "loss", "content": 0.007943389937281609, "timestamp": "2025-09-15 03:17:14.264988", "step": 1304, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:14.296595", "step": 1304, "epoch": 3 }, { "type": "loss", "content": 0.008471814915537834, "timestamp": "2025-09-15 03:17:14.298487", "step": 1305, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:14.328621", "step": 1305, "epoch": 3 }, { "type": "loss", "content": 0.03487579524517059, "timestamp": "2025-09-15 03:17:14.330760", "step": 1306, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:14.361653", "step": 1306, "epoch": 3 }, { "type": "loss", "content": 0.002479907125234604, "timestamp": "2025-09-15 03:17:14.363872", "step": 1307, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:14.396651", "step": 1307, "epoch": 3 }, { "type": "loss", "content": 0.00047891700523905456, "timestamp": "2025-09-15 03:17:14.421457", "step": 1308, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:14.452004", "step": 1308, "epoch": 3 }, { "type": "loss", "content": 0.0009700975497253239, "timestamp": "2025-09-15 03:17:14.454102", "step": 1309, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:14.484472", "step": 1309, "epoch": 3 }, { "type": "loss", "content": 0.0008935186197049916, "timestamp": "2025-09-15 03:17:14.488647", "step": 1310, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:14.519384", "step": 1310, "epoch": 3 }, { "type": "loss", "content": 0.008795429021120071, "timestamp": "2025-09-15 03:17:14.522991", "step": 1311, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:14.554193", "step": 1311, "epoch": 3 }, { "type": "loss", "content": 0.006600773893296719, "timestamp": "2025-09-15 03:17:14.577811", "step": 1312, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:14.608655", "step": 1312, "epoch": 3 }, { "type": "loss", "content": 0.018867747858166695, "timestamp": "2025-09-15 03:17:14.611002", "step": 1313, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:14.641810", "step": 1313, "epoch": 3 }, { "type": "loss", "content": 0.0008665714995004237, "timestamp": "2025-09-15 03:17:14.644014", "step": 1314, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:14.674439", "step": 1314, "epoch": 3 }, { "type": "loss", "content": 0.0014345311792567372, "timestamp": "2025-09-15 03:17:14.678247", "step": 1315, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:14.710011", "step": 1315, "epoch": 3 }, { "type": "loss", "content": 0.0004817911540158093, "timestamp": "2025-09-15 03:17:14.733548", "step": 1316, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:14.764297", "step": 1316, "epoch": 3 }, { "type": "loss", "content": 0.0016825651982799172, "timestamp": "2025-09-15 03:17:14.767897", "step": 1317, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:14.797840", "step": 1317, "epoch": 3 }, { "type": "loss", "content": 0.0014796556206420064, "timestamp": "2025-09-15 03:17:14.799997", "step": 1318, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:14.830624", "step": 1318, "epoch": 3 }, { "type": "loss", "content": 0.00032859708881005645, "timestamp": "2025-09-15 03:17:14.832744", "step": 1319, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:14.863545", "step": 1319, "epoch": 3 }, { "type": "loss", "content": 0.0011759724002331495, "timestamp": "2025-09-15 03:17:14.887673", "step": 1320, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:14.918357", "step": 1320, "epoch": 3 }, { "type": "loss", "content": 7.884834485594183e-05, "timestamp": "2025-09-15 03:17:14.920322", "step": 1321, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:14.951652", "step": 1321, "epoch": 3 }, { "type": "loss", "content": 0.00024336694332305342, "timestamp": "2025-09-15 03:17:14.955758", "step": 1322, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:14.986712", "step": 1322, "epoch": 3 }, { "type": "loss", "content": 0.0012510953238233924, "timestamp": "2025-09-15 03:17:14.989087", "step": 1323, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:15.019814", "step": 1323, "epoch": 3 }, { "type": "loss", "content": 0.012558196671307087, "timestamp": "2025-09-15 03:17:15.047203", "step": 1324, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:15.077861", "step": 1324, "epoch": 3 }, { "type": "loss", "content": 0.03249092772603035, "timestamp": "2025-09-15 03:17:15.082532", "step": 1325, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:15.113161", "step": 1325, "epoch": 3 }, { "type": "loss", "content": 0.0008610652876086533, "timestamp": "2025-09-15 03:17:15.117034", "step": 1326, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:17:15.703944", "step": 1326, "epoch": 3 }, { "type": "pplx", "content": 114827208.4781151, "timestamp": "2025-09-15 03:17:15.705866", "step": 1326, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:15.735595", "step": 1326, "epoch": 3 }, { "type": "loss", "content": 0.0011096058879047632, "timestamp": "2025-09-15 03:17:15.741929", "step": 1327, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:15.772332", "step": 1327, "epoch": 3 }, { "type": "loss", "content": 0.0001657214161241427, "timestamp": "2025-09-15 03:17:15.797420", "step": 1328, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:15.828295", "step": 1328, "epoch": 3 }, { "type": "loss", "content": 0.00018686882685869932, "timestamp": "2025-09-15 03:17:15.833139", "step": 1329, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:15.864085", "step": 1329, "epoch": 3 }, { "type": "loss", "content": 0.0023887602146714926, "timestamp": "2025-09-15 03:17:15.867848", "step": 1330, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:15.899069", "step": 1330, "epoch": 3 }, { "type": "loss", "content": 0.00025507682585157454, "timestamp": "2025-09-15 03:17:15.905779", "step": 1331, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:15.936741", "step": 1331, "epoch": 3 }, { "type": "loss", "content": 0.00046792227658443153, "timestamp": "2025-09-15 03:17:15.960274", "step": 1332, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:15.990979", "step": 1332, "epoch": 3 }, { "type": "loss", "content": 0.0033070999197661877, "timestamp": "2025-09-15 03:17:15.995891", "step": 1333, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:16.026181", "step": 1333, "epoch": 3 }, { "type": "loss", "content": 0.002882573287934065, "timestamp": "2025-09-15 03:17:16.028235", "step": 1334, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:16.059017", "step": 1334, "epoch": 3 }, { "type": "loss", "content": 0.0002578755666036159, "timestamp": "2025-09-15 03:17:16.065682", "step": 1335, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:16.096267", "step": 1335, "epoch": 3 }, { "type": "loss", "content": 0.0017550382763147354, "timestamp": "2025-09-15 03:17:16.120074", "step": 1336, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:16.150812", "step": 1336, "epoch": 3 }, { "type": "loss", "content": 0.00020929174206685275, "timestamp": "2025-09-15 03:17:16.157749", "step": 1337, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:16.190951", "step": 1337, "epoch": 3 }, { "type": "loss", "content": 0.0007545395055785775, "timestamp": "2025-09-15 03:17:16.195321", "step": 1338, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:16.225885", "step": 1338, "epoch": 3 }, { "type": "loss", "content": 0.032332248985767365, "timestamp": "2025-09-15 03:17:16.229702", "step": 1339, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:16.259973", "step": 1339, "epoch": 3 }, { "type": "loss", "content": 0.005746941082179546, "timestamp": "2025-09-15 03:17:16.285314", "step": 1340, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:16.315600", "step": 1340, "epoch": 3 }, { "type": "loss", "content": 0.0004919808125123382, "timestamp": "2025-09-15 03:17:16.317829", "step": 1341, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:16.348614", "step": 1341, "epoch": 3 }, { "type": "loss", "content": 0.0002653584233485162, "timestamp": "2025-09-15 03:17:16.352545", "step": 1342, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:16.383219", "step": 1342, "epoch": 3 }, { "type": "loss", "content": 0.00044335488928481936, "timestamp": "2025-09-15 03:17:16.385358", "step": 1343, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:16.415699", "step": 1343, "epoch": 3 }, { "type": "loss", "content": 0.0016609487356618047, "timestamp": "2025-09-15 03:17:16.439516", "step": 1344, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:16.470661", "step": 1344, "epoch": 3 }, { "type": "loss", "content": 0.0016143351094797254, "timestamp": "2025-09-15 03:17:16.475637", "step": 1345, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:16.517008", "step": 1345, "epoch": 3 }, { "type": "loss", "content": 0.0006745156715624034, "timestamp": "2025-09-15 03:17:16.519452", "step": 1346, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:16.549564", "step": 1346, "epoch": 3 }, { "type": "loss", "content": 0.0014027913566678762, "timestamp": "2025-09-15 03:17:16.556047", "step": 1347, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:16.586376", "step": 1347, "epoch": 3 }, { "type": "loss", "content": 0.0003329945320729166, "timestamp": "2025-09-15 03:17:16.611725", "step": 1348, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:16.641501", "step": 1348, "epoch": 3 }, { "type": "loss", "content": 0.0014508651802316308, "timestamp": "2025-09-15 03:17:16.643638", "step": 1349, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:16.674134", "step": 1349, "epoch": 3 }, { "type": "loss", "content": 0.0008561424911022186, "timestamp": "2025-09-15 03:17:16.678392", "step": 1350, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:17:16.708629", "step": 1350, "epoch": 3 }, { "type": "loss", "content": 0.0001607358717592433, "timestamp": "2025-09-15 03:17:16.716321", "step": 1351, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:16.746710", "step": 1351, "epoch": 3 }, { "type": "loss", "content": 0.0013879425823688507, "timestamp": "2025-09-15 03:17:16.770497", "step": 1352, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:16.802699", "step": 1352, "epoch": 3 }, { "type": "loss", "content": 0.00041087184217758477, "timestamp": "2025-09-15 03:17:16.805119", "step": 1353, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:16.835324", "step": 1353, "epoch": 3 }, { "type": "loss", "content": 0.0003274380578659475, "timestamp": "2025-09-15 03:17:16.837342", "step": 1354, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:16.867347", "step": 1354, "epoch": 3 }, { "type": "loss", "content": 0.0003536785370670259, "timestamp": "2025-09-15 03:17:16.869553", "step": 1355, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:16.900490", "step": 1355, "epoch": 3 }, { "type": "loss", "content": 0.007172483950853348, "timestamp": "2025-09-15 03:17:16.925213", "step": 1356, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:16.955759", "step": 1356, "epoch": 3 }, { "type": "loss", "content": 0.004883793648332357, "timestamp": "2025-09-15 03:17:16.957832", "step": 1357, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:16.987871", "step": 1357, "epoch": 3 }, { "type": "loss", "content": 0.00017396271869074553, "timestamp": "2025-09-15 03:17:16.989906", "step": 1358, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:17.021886", "step": 1358, "epoch": 3 }, { "type": "loss", "content": 0.0007931128493510187, "timestamp": "2025-09-15 03:17:17.025637", "step": 1359, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:17.056290", "step": 1359, "epoch": 3 }, { "type": "loss", "content": 0.00537915201857686, "timestamp": "2025-09-15 03:17:17.084613", "step": 1360, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:17.115120", "step": 1360, "epoch": 3 }, { "type": "loss", "content": 0.0009680739603936672, "timestamp": "2025-09-15 03:17:17.117300", "step": 1361, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:17.147655", "step": 1361, "epoch": 3 }, { "type": "loss", "content": 0.0007127004791982472, "timestamp": "2025-09-15 03:17:17.152103", "step": 1362, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:17.182137", "step": 1362, "epoch": 3 }, { "type": "loss", "content": 0.0012851905776187778, "timestamp": "2025-09-15 03:17:17.184109", "step": 1363, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:17.214588", "step": 1363, "epoch": 3 }, { "type": "loss", "content": 0.0003384167794138193, "timestamp": "2025-09-15 03:17:17.242119", "step": 1364, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:17.272477", "step": 1364, "epoch": 3 }, { "type": "loss", "content": 0.0007884152000769973, "timestamp": "2025-09-15 03:17:17.274543", "step": 1365, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:17:17.856696", "step": 1365, "epoch": 3 }, { "type": "pplx", "content": 119068416.19124384, "timestamp": "2025-09-15 03:17:17.858747", "step": 1365, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 288 ], "flops": 8543129804160 }, "timestamp": "2025-09-15 03:17:17.888098", "step": 1365, "epoch": 3 }, { "type": "loss", "content": 0.0002495049557182938, "timestamp": "2025-09-15 03:17:17.898583", "step": 1366, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:17.929158", "step": 1366, "epoch": 3 }, { "type": "loss", "content": 0.0002520255511626601, "timestamp": "2025-09-15 03:17:17.933377", "step": 1367, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:17.963662", "step": 1367, "epoch": 3 }, { "type": "loss", "content": 0.00036317037302069366, "timestamp": "2025-09-15 03:17:17.987417", "step": 1368, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:18.017858", "step": 1368, "epoch": 3 }, { "type": "loss", "content": 0.0032210820354521275, "timestamp": "2025-09-15 03:17:18.020093", "step": 1369, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:18.051133", "step": 1369, "epoch": 3 }, { "type": "loss", "content": 0.0008396569755859673, "timestamp": "2025-09-15 03:17:18.057839", "step": 1370, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:18.088284", "step": 1370, "epoch": 3 }, { "type": "loss", "content": 0.002095963107421994, "timestamp": "2025-09-15 03:17:18.090526", "step": 1371, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:18.120947", "step": 1371, "epoch": 3 }, { "type": "loss", "content": 0.001525636063888669, "timestamp": "2025-09-15 03:17:18.146237", "step": 1372, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:18.176471", "step": 1372, "epoch": 3 }, { "type": "loss", "content": 0.00012938915460836142, "timestamp": "2025-09-15 03:17:18.178864", "step": 1373, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:18.209188", "step": 1373, "epoch": 3 }, { "type": "loss", "content": 0.00017872024909593165, "timestamp": "2025-09-15 03:17:18.211448", "step": 1374, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:18.243395", "step": 1374, "epoch": 3 }, { "type": "loss", "content": 0.0010354757541790605, "timestamp": "2025-09-15 03:17:18.247754", "step": 1375, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-15 03:17:18.278609", "step": 1375, "epoch": 3 }, { "type": "loss", "content": 0.0005616145208477974, "timestamp": "2025-09-15 03:17:18.309279", "step": 1376, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:18.340015", "step": 1376, "epoch": 3 }, { "type": "loss", "content": 0.0001830124092521146, "timestamp": "2025-09-15 03:17:18.342241", "step": 1377, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:18.372319", "step": 1377, "epoch": 3 }, { "type": "loss", "content": 0.0011224261252209544, "timestamp": "2025-09-15 03:17:18.374380", "step": 1378, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:18.404930", "step": 1378, "epoch": 3 }, { "type": "loss", "content": 0.0008521327981725335, "timestamp": "2025-09-15 03:17:18.408767", "step": 1379, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:18.439865", "step": 1379, "epoch": 3 }, { "type": "loss", "content": 0.00017294203280471265, "timestamp": "2025-09-15 03:17:18.467276", "step": 1380, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:18.498338", "step": 1380, "epoch": 3 }, { "type": "loss", "content": 0.003641907824203372, "timestamp": "2025-09-15 03:17:18.503055", "step": 1381, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:18.533473", "step": 1381, "epoch": 3 }, { "type": "loss", "content": 0.00016708121984265745, "timestamp": "2025-09-15 03:17:18.537813", "step": 1382, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:18.568192", "step": 1382, "epoch": 3 }, { "type": "loss", "content": 0.029590027406811714, "timestamp": "2025-09-15 03:17:18.570374", "step": 1383, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:18.600642", "step": 1383, "epoch": 3 }, { "type": "loss", "content": 0.00025269907200708985, "timestamp": "2025-09-15 03:17:18.624692", "step": 1384, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:18.659121", "step": 1384, "epoch": 3 }, { "type": "loss", "content": 0.002378477482125163, "timestamp": "2025-09-15 03:17:18.662081", "step": 1385, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:18.693696", "step": 1385, "epoch": 3 }, { "type": "loss", "content": 8.785168029135093e-05, "timestamp": "2025-09-15 03:17:18.696709", "step": 1386, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:18.727770", "step": 1386, "epoch": 3 }, { "type": "loss", "content": 0.0009063719771802425, "timestamp": "2025-09-15 03:17:18.730435", "step": 1387, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:17:18.761287", "step": 1387, "epoch": 3 }, { "type": "loss", "content": 0.0010425930377095938, "timestamp": "2025-09-15 03:17:18.789671", "step": 1388, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:18.820163", "step": 1388, "epoch": 3 }, { "type": "loss", "content": 0.0005211577517911792, "timestamp": "2025-09-15 03:17:18.822298", "step": 1389, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:18.852697", "step": 1389, "epoch": 3 }, { "type": "loss", "content": 0.001825684099458158, "timestamp": "2025-09-15 03:17:18.854870", "step": 1390, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:18.885119", "step": 1390, "epoch": 3 }, { "type": "loss", "content": 0.000622618361376226, "timestamp": "2025-09-15 03:17:18.887079", "step": 1391, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:18.917409", "step": 1391, "epoch": 3 }, { "type": "loss", "content": 0.00014275507419370115, "timestamp": "2025-09-15 03:17:18.940865", "step": 1392, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:18.971215", "step": 1392, "epoch": 3 }, { "type": "loss", "content": 0.0006256733322516084, "timestamp": "2025-09-15 03:17:18.973235", "step": 1393, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:19.003969", "step": 1393, "epoch": 3 }, { "type": "loss", "content": 9.841892460826784e-05, "timestamp": "2025-09-15 03:17:19.007844", "step": 1394, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:19.038537", "step": 1394, "epoch": 3 }, { "type": "loss", "content": 0.0001524169056210667, "timestamp": "2025-09-15 03:17:19.042402", "step": 1395, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:19.073659", "step": 1395, "epoch": 3 }, { "type": "loss", "content": 0.0001233371876878664, "timestamp": "2025-09-15 03:17:19.098896", "step": 1396, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:19.129491", "step": 1396, "epoch": 3 }, { "type": "loss", "content": 0.001133440644480288, "timestamp": "2025-09-15 03:17:19.131592", "step": 1397, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:19.161991", "step": 1397, "epoch": 3 }, { "type": "loss", "content": 0.0007659767870791256, "timestamp": "2025-09-15 03:17:19.164197", "step": 1398, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:19.194321", "step": 1398, "epoch": 3 }, { "type": "loss", "content": 0.00019041731138713658, "timestamp": "2025-09-15 03:17:19.196888", "step": 1399, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:19.227451", "step": 1399, "epoch": 3 }, { "type": "loss", "content": 0.0005006372230127454, "timestamp": "2025-09-15 03:17:19.252419", "step": 1400, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:19.282650", "step": 1400, "epoch": 3 }, { "type": "loss", "content": 0.00273110275156796, "timestamp": "2025-09-15 03:17:19.284825", "step": 1401, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:19.315783", "step": 1401, "epoch": 3 }, { "type": "loss", "content": 0.00023853448510635644, "timestamp": "2025-09-15 03:17:19.322259", "step": 1402, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:19.352756", "step": 1402, "epoch": 3 }, { "type": "loss", "content": 0.0010633624624460936, "timestamp": "2025-09-15 03:17:19.354929", "step": 1403, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:19.385636", "step": 1403, "epoch": 3 }, { "type": "loss", "content": 0.0003287098079454154, "timestamp": "2025-09-15 03:17:19.410907", "step": 1404, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:17:19.997404", "step": 1404, "epoch": 3 }, { "type": "pplx", "content": 125354850.80082554, "timestamp": "2025-09-15 03:17:19.999430", "step": 1404, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:20.027669", "step": 1404, "epoch": 3 }, { "type": "loss", "content": 0.002071457216516137, "timestamp": "2025-09-15 03:17:20.030890", "step": 1405, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:20.061827", "step": 1405, "epoch": 3 }, { "type": "loss", "content": 0.00012812399654649198, "timestamp": "2025-09-15 03:17:20.068618", "step": 1406, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:20.099609", "step": 1406, "epoch": 3 }, { "type": "loss", "content": 0.0007547488785348833, "timestamp": "2025-09-15 03:17:20.103533", "step": 1407, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:20.133573", "step": 1407, "epoch": 3 }, { "type": "loss", "content": 9.140604379354045e-05, "timestamp": "2025-09-15 03:17:20.157184", "step": 1408, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:20.187758", "step": 1408, "epoch": 3 }, { "type": "loss", "content": 0.002761248266324401, "timestamp": "2025-09-15 03:17:20.192625", "step": 1409, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:20.222838", "step": 1409, "epoch": 3 }, { "type": "loss", "content": 0.0002269747928949073, "timestamp": "2025-09-15 03:17:20.227381", "step": 1410, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:20.257969", "step": 1410, "epoch": 3 }, { "type": "loss", "content": 0.00021279798238538206, "timestamp": "2025-09-15 03:17:20.260552", "step": 1411, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:20.290671", "step": 1411, "epoch": 3 }, { "type": "loss", "content": 0.001422856585122645, "timestamp": "2025-09-15 03:17:20.315554", "step": 1412, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:20.345574", "step": 1412, "epoch": 3 }, { "type": "loss", "content": 8.99404039955698e-05, "timestamp": "2025-09-15 03:17:20.347563", "step": 1413, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:20.377831", "step": 1413, "epoch": 3 }, { "type": "loss", "content": 0.04837150126695633, "timestamp": "2025-09-15 03:17:20.380351", "step": 1414, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:20.410909", "step": 1414, "epoch": 3 }, { "type": "loss", "content": 0.0014580102870240808, "timestamp": "2025-09-15 03:17:20.418169", "step": 1415, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:20.449235", "step": 1415, "epoch": 3 }, { "type": "loss", "content": 0.02324610762298107, "timestamp": "2025-09-15 03:17:20.474197", "step": 1416, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:20.504773", "step": 1416, "epoch": 3 }, { "type": "loss", "content": 0.0024212843272835016, "timestamp": "2025-09-15 03:17:20.506889", "step": 1417, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:20.537342", "step": 1417, "epoch": 3 }, { "type": "loss", "content": 4.8245903599308804e-05, "timestamp": "2025-09-15 03:17:20.544449", "step": 1418, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:20.575729", "step": 1418, "epoch": 3 }, { "type": "loss", "content": 0.0004255035310052335, "timestamp": "2025-09-15 03:17:20.579672", "step": 1419, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:20.609848", "step": 1419, "epoch": 3 }, { "type": "loss", "content": 0.00013181402755435556, "timestamp": "2025-09-15 03:17:20.633286", "step": 1420, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:20.663715", "step": 1420, "epoch": 3 }, { "type": "loss", "content": 0.00017817116167861968, "timestamp": "2025-09-15 03:17:20.665800", "step": 1421, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:20.696500", "step": 1421, "epoch": 3 }, { "type": "loss", "content": 0.00030947051709517837, "timestamp": "2025-09-15 03:17:20.698821", "step": 1422, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:20.730573", "step": 1422, "epoch": 3 }, { "type": "loss", "content": 0.0001780502061592415, "timestamp": "2025-09-15 03:17:20.732659", "step": 1423, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:20.763220", "step": 1423, "epoch": 3 }, { "type": "loss", "content": 8.599001739639789e-05, "timestamp": "2025-09-15 03:17:20.786767", "step": 1424, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:20.817233", "step": 1424, "epoch": 3 }, { "type": "loss", "content": 0.00014561659190803766, "timestamp": "2025-09-15 03:17:20.819123", "step": 1425, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:20.849211", "step": 1425, "epoch": 3 }, { "type": "loss", "content": 0.00022120056382846087, "timestamp": "2025-09-15 03:17:20.851161", "step": 1426, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:20.881931", "step": 1426, "epoch": 3 }, { "type": "loss", "content": 0.00026828559930436313, "timestamp": "2025-09-15 03:17:20.886010", "step": 1427, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:20.916519", "step": 1427, "epoch": 3 }, { "type": "loss", "content": 0.002969352062791586, "timestamp": "2025-09-15 03:17:20.944556", "step": 1428, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:20.974854", "step": 1428, "epoch": 3 }, { "type": "loss", "content": 0.00018780956452246755, "timestamp": "2025-09-15 03:17:20.976874", "step": 1429, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:21.007740", "step": 1429, "epoch": 3 }, { "type": "loss", "content": 0.0001267143670702353, "timestamp": "2025-09-15 03:17:21.010161", "step": 1430, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:21.040085", "step": 1430, "epoch": 3 }, { "type": "loss", "content": 0.001236670301295817, "timestamp": "2025-09-15 03:17:21.042101", "step": 1431, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:21.073383", "step": 1431, "epoch": 3 }, { "type": "loss", "content": 0.0011406567646190524, "timestamp": "2025-09-15 03:17:21.098157", "step": 1432, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:21.128481", "step": 1432, "epoch": 3 }, { "type": "loss", "content": 0.0039399052038788795, "timestamp": "2025-09-15 03:17:21.130508", "step": 1433, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:21.161138", "step": 1433, "epoch": 3 }, { "type": "loss", "content": 0.0001970245357370004, "timestamp": "2025-09-15 03:17:21.167836", "step": 1434, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:21.198362", "step": 1434, "epoch": 3 }, { "type": "loss", "content": 0.0003503093321342021, "timestamp": "2025-09-15 03:17:21.202514", "step": 1435, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:21.232623", "step": 1435, "epoch": 3 }, { "type": "loss", "content": 9.151557605946437e-05, "timestamp": "2025-09-15 03:17:21.257529", "step": 1436, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:21.287382", "step": 1436, "epoch": 3 }, { "type": "loss", "content": 0.00021274581376928836, "timestamp": "2025-09-15 03:17:21.289446", "step": 1437, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:21.319495", "step": 1437, "epoch": 3 }, { "type": "loss", "content": 0.0023747701197862625, "timestamp": "2025-09-15 03:17:21.321406", "step": 1438, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:21.351725", "step": 1438, "epoch": 3 }, { "type": "loss", "content": 0.00023102053091861308, "timestamp": "2025-09-15 03:17:21.354202", "step": 1439, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:21.384941", "step": 1439, "epoch": 3 }, { "type": "loss", "content": 0.0003894390829373151, "timestamp": "2025-09-15 03:17:21.409765", "step": 1440, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:21.439986", "step": 1440, "epoch": 3 }, { "type": "loss", "content": 5.9962829254800454e-05, "timestamp": "2025-09-15 03:17:21.441978", "step": 1441, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:21.473888", "step": 1441, "epoch": 3 }, { "type": "loss", "content": 0.011845647357404232, "timestamp": "2025-09-15 03:17:21.478136", "step": 1442, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:21.512786", "step": 1442, "epoch": 3 }, { "type": "loss", "content": 0.0003542363701853901, "timestamp": "2025-09-15 03:17:21.517346", "step": 1443, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:17:22.125497", "step": 1443, "epoch": 3 }, { "type": "pplx", "content": 126089176.76405717, "timestamp": "2025-09-15 03:17:22.127467", "step": 1443, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:22.156150", "step": 1443, "epoch": 3 }, { "type": "loss", "content": 8.075464575085789e-05, "timestamp": "2025-09-15 03:17:22.186025", "step": 1444, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:22.216635", "step": 1444, "epoch": 3 }, { "type": "loss", "content": 6.827951438026503e-05, "timestamp": "2025-09-15 03:17:22.223525", "step": 1445, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:22.254632", "step": 1445, "epoch": 3 }, { "type": "loss", "content": 0.00014688474766444415, "timestamp": "2025-09-15 03:17:22.260996", "step": 1446, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:22.293658", "step": 1446, "epoch": 3 }, { "type": "loss", "content": 0.00030845263972878456, "timestamp": "2025-09-15 03:17:22.300301", "step": 1447, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:22.330668", "step": 1447, "epoch": 3 }, { "type": "loss", "content": 0.00010615136852720752, "timestamp": "2025-09-15 03:17:22.355509", "step": 1448, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:22.385780", "step": 1448, "epoch": 3 }, { "type": "loss", "content": 0.0005661295726895332, "timestamp": "2025-09-15 03:17:22.387759", "step": 1449, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:22.425443", "step": 1449, "epoch": 3 }, { "type": "loss", "content": 0.040414344519376755, "timestamp": "2025-09-15 03:17:22.427588", "step": 1450, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:22.457225", "step": 1450, "epoch": 3 }, { "type": "loss", "content": 8.917743980418891e-05, "timestamp": "2025-09-15 03:17:22.459330", "step": 1451, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:22.491053", "step": 1451, "epoch": 3 }, { "type": "loss", "content": 0.0008629755466245115, "timestamp": "2025-09-15 03:17:22.514575", "step": 1452, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:22.545058", "step": 1452, "epoch": 3 }, { "type": "loss", "content": 0.009162337519228458, "timestamp": "2025-09-15 03:17:22.547113", "step": 1453, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:22.578072", "step": 1453, "epoch": 3 }, { "type": "loss", "content": 0.002335967728868127, "timestamp": "2025-09-15 03:17:22.584799", "step": 1454, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:22.615019", "step": 1454, "epoch": 3 }, { "type": "loss", "content": 0.0006352242780849338, "timestamp": "2025-09-15 03:17:22.617709", "step": 1455, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:22.648379", "step": 1455, "epoch": 3 }, { "type": "loss", "content": 0.0012936294078826904, "timestamp": "2025-09-15 03:17:22.671970", "step": 1456, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:22.703783", "step": 1456, "epoch": 3 }, { "type": "loss", "content": 0.0026901266537606716, "timestamp": "2025-09-15 03:17:22.707835", "step": 1457, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:22.738628", "step": 1457, "epoch": 3 }, { "type": "loss", "content": 0.0022749509662389755, "timestamp": "2025-09-15 03:17:22.741020", "step": 1458, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:22.771416", "step": 1458, "epoch": 3 }, { "type": "loss", "content": 0.0003408204356674105, "timestamp": "2025-09-15 03:17:22.773491", "step": 1459, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:22.805075", "step": 1459, "epoch": 3 }, { "type": "loss", "content": 0.008805022574961185, "timestamp": "2025-09-15 03:17:22.830041", "step": 1460, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:22.860421", "step": 1460, "epoch": 3 }, { "type": "loss", "content": 9.171007695840672e-05, "timestamp": "2025-09-15 03:17:22.862738", "step": 1461, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:22.893515", "step": 1461, "epoch": 3 }, { "type": "loss", "content": 7.348069630097598e-05, "timestamp": "2025-09-15 03:17:22.895796", "step": 1462, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:22.926014", "step": 1462, "epoch": 3 }, { "type": "loss", "content": 0.00017722553457133472, "timestamp": "2025-09-15 03:17:22.928111", "step": 1463, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:17:22.958789", "step": 1463, "epoch": 3 }, { "type": "loss", "content": 0.00032406425452791154, "timestamp": "2025-09-15 03:17:22.987286", "step": 1464, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:23.017892", "step": 1464, "epoch": 3 }, { "type": "loss", "content": 0.00012766069266945124, "timestamp": "2025-09-15 03:17:23.020001", "step": 1465, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:17:23.050747", "step": 1465, "epoch": 3 }, { "type": "loss", "content": 0.00027193533605895936, "timestamp": "2025-09-15 03:17:23.058292", "step": 1466, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:23.088702", "step": 1466, "epoch": 3 }, { "type": "loss", "content": 5.195228732191026e-05, "timestamp": "2025-09-15 03:17:23.092907", "step": 1467, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-15 03:17:23.124010", "step": 1467, "epoch": 3 }, { "type": "loss", "content": 0.002318126615136862, "timestamp": "2025-09-15 03:17:23.155314", "step": 1468, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:23.186314", "step": 1468, "epoch": 3 }, { "type": "loss", "content": 0.0008258981979452074, "timestamp": "2025-09-15 03:17:23.190907", "step": 1469, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:23.221516", "step": 1469, "epoch": 3 }, { "type": "loss", "content": 0.0005010002059862018, "timestamp": "2025-09-15 03:17:23.228134", "step": 1470, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:23.258183", "step": 1470, "epoch": 3 }, { "type": "loss", "content": 0.006026547867804766, "timestamp": "2025-09-15 03:17:23.260299", "step": 1471, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:23.290785", "step": 1471, "epoch": 3 }, { "type": "loss", "content": 0.00020711333490908146, "timestamp": "2025-09-15 03:17:23.314496", "step": 1472, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:23.345215", "step": 1472, "epoch": 3 }, { "type": "loss", "content": 0.0006892742239870131, "timestamp": "2025-09-15 03:17:23.347304", "step": 1473, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:23.377211", "step": 1473, "epoch": 3 }, { "type": "loss", "content": 9.77161034825258e-05, "timestamp": "2025-09-15 03:17:23.379297", "step": 1474, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:23.410623", "step": 1474, "epoch": 3 }, { "type": "loss", "content": 4.4186213926877826e-05, "timestamp": "2025-09-15 03:17:23.417619", "step": 1475, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:23.448364", "step": 1475, "epoch": 3 }, { "type": "loss", "content": 0.018671514466404915, "timestamp": "2025-09-15 03:17:23.472111", "step": 1476, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:23.503974", "step": 1476, "epoch": 3 }, { "type": "loss", "content": 0.00743169104680419, "timestamp": "2025-09-15 03:17:23.505979", "step": 1477, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:23.536380", "step": 1477, "epoch": 3 }, { "type": "loss", "content": 0.00011039253149647266, "timestamp": "2025-09-15 03:17:23.543811", "step": 1478, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:23.574266", "step": 1478, "epoch": 3 }, { "type": "loss", "content": 0.0047522238455712795, "timestamp": "2025-09-15 03:17:23.581681", "step": 1479, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:23.613471", "step": 1479, "epoch": 3 }, { "type": "loss", "content": 0.00016252427303697914, "timestamp": "2025-09-15 03:17:23.636942", "step": 1480, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:23.667421", "step": 1480, "epoch": 3 }, { "type": "loss", "content": 0.000923981424421072, "timestamp": "2025-09-15 03:17:23.669863", "step": 1481, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:17:23.700883", "step": 1481, "epoch": 3 }, { "type": "loss", "content": 0.00042834514169953763, "timestamp": "2025-09-15 03:17:23.708462", "step": 1482, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:17:24.297591", "step": 1482, "epoch": 3 }, { "type": "pplx", "content": 126383143.71461472, "timestamp": "2025-09-15 03:17:24.299474", "step": 1482, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:24.328575", "step": 1482, "epoch": 3 }, { "type": "loss", "content": 0.0013859715545549989, "timestamp": "2025-09-15 03:17:24.332626", "step": 1483, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:24.363124", "step": 1483, "epoch": 3 }, { "type": "loss", "content": 0.0002539039996918291, "timestamp": "2025-09-15 03:17:24.388304", "step": 1484, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:24.418678", "step": 1484, "epoch": 3 }, { "type": "loss", "content": 0.018063468858599663, "timestamp": "2025-09-15 03:17:24.420936", "step": 1485, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:24.452061", "step": 1485, "epoch": 3 }, { "type": "loss", "content": 0.00024127533833961934, "timestamp": "2025-09-15 03:17:24.456417", "step": 1486, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:24.487198", "step": 1486, "epoch": 3 }, { "type": "loss", "content": 0.00022271994384936988, "timestamp": "2025-09-15 03:17:24.494067", "step": 1487, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:24.524540", "step": 1487, "epoch": 3 }, { "type": "loss", "content": 0.00039580193697474897, "timestamp": "2025-09-15 03:17:24.549530", "step": 1488, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:24.580906", "step": 1488, "epoch": 3 }, { "type": "loss", "content": 0.007025205530226231, "timestamp": "2025-09-15 03:17:24.585235", "step": 1489, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:24.616611", "step": 1489, "epoch": 3 }, { "type": "loss", "content": 0.0018606951925903559, "timestamp": "2025-09-15 03:17:24.621126", "step": 1490, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:24.653171", "step": 1490, "epoch": 3 }, { "type": "loss", "content": 0.0014626128831878304, "timestamp": "2025-09-15 03:17:24.659765", "step": 1491, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:24.691466", "step": 1491, "epoch": 3 }, { "type": "loss", "content": 0.00030919170239940286, "timestamp": "2025-09-15 03:17:24.719695", "step": 1492, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:24.751431", "step": 1492, "epoch": 3 }, { "type": "loss", "content": 7.744396134512499e-05, "timestamp": "2025-09-15 03:17:24.753810", "step": 1493, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:24.785352", "step": 1493, "epoch": 3 }, { "type": "loss", "content": 0.005347420461475849, "timestamp": "2025-09-15 03:17:24.789035", "step": 1494, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:24.827759", "step": 1494, "epoch": 3 }, { "type": "loss", "content": 3.8123798731248826e-05, "timestamp": "2025-09-15 03:17:24.833829", "step": 1495, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:24.863895", "step": 1495, "epoch": 3 }, { "type": "loss", "content": 5.30069628439378e-05, "timestamp": "2025-09-15 03:17:24.887515", "step": 1496, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:24.918993", "step": 1496, "epoch": 3 }, { "type": "loss", "content": 8.05064119049348e-05, "timestamp": "2025-09-15 03:17:24.921006", "step": 1497, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:24.951675", "step": 1497, "epoch": 3 }, { "type": "loss", "content": 0.0006126973894424736, "timestamp": "2025-09-15 03:17:24.953857", "step": 1498, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:24.985111", "step": 1498, "epoch": 3 }, { "type": "loss", "content": 0.0006413686205632985, "timestamp": "2025-09-15 03:17:24.987174", "step": 1499, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:25.017442", "step": 1499, "epoch": 3 }, { "type": "loss", "content": 0.00629863515496254, "timestamp": "2025-09-15 03:17:25.041021", "step": 1500, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 1500", "timestamp": "2025-09-15 03:17:31.238254", "step": 1500, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:31.273356", "step": 1500, "epoch": 3 }, { "type": "loss", "content": 0.020123621448874474, "timestamp": "2025-09-15 03:17:31.275565", "step": 1501, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:31.306674", "step": 1501, "epoch": 3 }, { "type": "loss", "content": 0.0001477115583838895, "timestamp": "2025-09-15 03:17:31.308707", "step": 1502, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:31.339513", "step": 1502, "epoch": 3 }, { "type": "loss", "content": 0.00033375757629983127, "timestamp": "2025-09-15 03:17:31.345785", "step": 1503, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:17:31.377326", "step": 1503, "epoch": 3 }, { "type": "loss", "content": 4.395996074890718e-05, "timestamp": "2025-09-15 03:17:31.405520", "step": 1504, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:31.436569", "step": 1504, "epoch": 3 }, { "type": "loss", "content": 0.008604790084064007, "timestamp": "2025-09-15 03:17:31.438631", "step": 1505, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:31.468975", "step": 1505, "epoch": 3 }, { "type": "loss", "content": 0.00032148772152140737, "timestamp": "2025-09-15 03:17:31.475518", "step": 1506, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:31.507445", "step": 1506, "epoch": 3 }, { "type": "loss", "content": 0.00021968342480249703, "timestamp": "2025-09-15 03:17:31.509731", "step": 1507, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:31.540464", "step": 1507, "epoch": 3 }, { "type": "loss", "content": 0.0002701000776141882, "timestamp": "2025-09-15 03:17:31.564206", "step": 1508, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:31.594640", "step": 1508, "epoch": 3 }, { "type": "loss", "content": 0.00018474875832907856, "timestamp": "2025-09-15 03:17:31.596720", "step": 1509, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:31.626769", "step": 1509, "epoch": 3 }, { "type": "loss", "content": 0.00023964645515661687, "timestamp": "2025-09-15 03:17:31.629305", "step": 1510, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:31.659628", "step": 1510, "epoch": 3 }, { "type": "loss", "content": 0.00017940135148819536, "timestamp": "2025-09-15 03:17:31.661917", "step": 1511, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:31.692443", "step": 1511, "epoch": 3 }, { "type": "loss", "content": 0.0011125266319140792, "timestamp": "2025-09-15 03:17:31.716053", "step": 1512, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:31.747264", "step": 1512, "epoch": 3 }, { "type": "loss", "content": 4.7944111429387704e-05, "timestamp": "2025-09-15 03:17:31.749380", "step": 1513, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:31.779733", "step": 1513, "epoch": 3 }, { "type": "loss", "content": 0.0005373401218093932, "timestamp": "2025-09-15 03:17:31.782096", "step": 1514, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:31.812540", "step": 1514, "epoch": 3 }, { "type": "loss", "content": 0.00032914456096477807, "timestamp": "2025-09-15 03:17:31.816560", "step": 1515, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:31.847803", "step": 1515, "epoch": 3 }, { "type": "loss", "content": 5.4149852076079696e-05, "timestamp": "2025-09-15 03:17:31.871442", "step": 1516, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:31.902172", "step": 1516, "epoch": 3 }, { "type": "loss", "content": 0.001020874478854239, "timestamp": "2025-09-15 03:17:31.904196", "step": 1517, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:31.936442", "step": 1517, "epoch": 3 }, { "type": "loss", "content": 0.0020698870066553354, "timestamp": "2025-09-15 03:17:31.942997", "step": 1518, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:31.975035", "step": 1518, "epoch": 3 }, { "type": "loss", "content": 0.0002697810996323824, "timestamp": "2025-09-15 03:17:31.982535", "step": 1519, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:32.013987", "step": 1519, "epoch": 3 }, { "type": "loss", "content": 6.943456537555903e-05, "timestamp": "2025-09-15 03:17:32.041704", "step": 1520, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:32.072340", "step": 1520, "epoch": 3 }, { "type": "loss", "content": 0.00020672274695243686, "timestamp": "2025-09-15 03:17:32.074605", "step": 1521, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:17:32.663987", "step": 1521, "epoch": 3 }, { "type": "pplx", "content": 125235580.84444861, "timestamp": "2025-09-15 03:17:32.665675", "step": 1521, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-15 03:17:32.695624", "step": 1521, "epoch": 3 }, { "type": "loss", "content": 5.122490256326273e-05, "timestamp": "2025-09-15 03:17:32.704972", "step": 1522, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:32.736033", "step": 1522, "epoch": 3 }, { "type": "loss", "content": 0.00010624745482346043, "timestamp": "2025-09-15 03:17:32.738523", "step": 1523, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:32.771574", "step": 1523, "epoch": 3 }, { "type": "loss", "content": 0.00021359531092457473, "timestamp": "2025-09-15 03:17:32.799726", "step": 1524, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:32.831616", "step": 1524, "epoch": 3 }, { "type": "loss", "content": 0.0006508774240501225, "timestamp": "2025-09-15 03:17:32.833852", "step": 1525, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:32.865074", "step": 1525, "epoch": 3 }, { "type": "loss", "content": 0.0001420353801222518, "timestamp": "2025-09-15 03:17:32.868944", "step": 1526, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:32.899428", "step": 1526, "epoch": 3 }, { "type": "loss", "content": 0.0002659276651684195, "timestamp": "2025-09-15 03:17:32.903686", "step": 1527, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:32.934591", "step": 1527, "epoch": 3 }, { "type": "loss", "content": 0.0002821832022164017, "timestamp": "2025-09-15 03:17:32.958272", "step": 1528, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:17:32.988858", "step": 1528, "epoch": 3 }, { "type": "loss", "content": 0.0001353594707325101, "timestamp": "2025-09-15 03:17:32.994163", "step": 1529, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 304 ], "flops": 9017733530176 }, "timestamp": "2025-09-15 03:17:33.025699", "step": 1529, "epoch": 3 }, { "type": "loss", "content": 0.00012022336886730045, "timestamp": "2025-09-15 03:17:33.037465", "step": 1530, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:33.068693", "step": 1530, "epoch": 3 }, { "type": "loss", "content": 8.694497955730185e-05, "timestamp": "2025-09-15 03:17:33.072782", "step": 1531, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:33.104407", "step": 1531, "epoch": 3 }, { "type": "loss", "content": 0.0010885964147746563, "timestamp": "2025-09-15 03:17:33.131984", "step": 1532, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:33.162936", "step": 1532, "epoch": 3 }, { "type": "loss", "content": 9.456718544242904e-05, "timestamp": "2025-09-15 03:17:33.167019", "step": 1533, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:33.198015", "step": 1533, "epoch": 3 }, { "type": "loss", "content": 7.467066461686045e-05, "timestamp": "2025-09-15 03:17:33.201926", "step": 1534, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:33.234017", "step": 1534, "epoch": 3 }, { "type": "loss", "content": 0.00028563832165673375, "timestamp": "2025-09-15 03:17:33.237925", "step": 1535, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:33.268366", "step": 1535, "epoch": 3 }, { "type": "loss", "content": 0.0002187574136769399, "timestamp": "2025-09-15 03:17:33.293694", "step": 1536, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:33.324069", "step": 1536, "epoch": 3 }, { "type": "loss", "content": 0.00011100248229922727, "timestamp": "2025-09-15 03:17:33.326344", "step": 1537, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:33.358555", "step": 1537, "epoch": 3 }, { "type": "loss", "content": 8.833393076201901e-05, "timestamp": "2025-09-15 03:17:33.361330", "step": 1538, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:33.391782", "step": 1538, "epoch": 3 }, { "type": "loss", "content": 0.0005231335526332259, "timestamp": "2025-09-15 03:17:33.394097", "step": 1539, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:33.424023", "step": 1539, "epoch": 3 }, { "type": "loss", "content": 8.783872181084007e-05, "timestamp": "2025-09-15 03:17:33.447675", "step": 1540, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:33.478658", "step": 1540, "epoch": 3 }, { "type": "loss", "content": 4.987287684343755e-05, "timestamp": "2025-09-15 03:17:33.482716", "step": 1541, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:33.512866", "step": 1541, "epoch": 3 }, { "type": "loss", "content": 0.0007524410611949861, "timestamp": "2025-09-15 03:17:33.514978", "step": 1542, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:33.545420", "step": 1542, "epoch": 3 }, { "type": "loss", "content": 0.00011115191591670737, "timestamp": "2025-09-15 03:17:33.547457", "step": 1543, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:33.577893", "step": 1543, "epoch": 3 }, { "type": "loss", "content": 9.302215039497241e-05, "timestamp": "2025-09-15 03:17:33.602913", "step": 1544, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:33.634182", "step": 1544, "epoch": 3 }, { "type": "loss", "content": 4.7555680794175714e-05, "timestamp": "2025-09-15 03:17:33.636238", "step": 1545, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:33.666896", "step": 1545, "epoch": 3 }, { "type": "loss", "content": 0.0001348310906905681, "timestamp": "2025-09-15 03:17:33.673475", "step": 1546, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:33.704241", "step": 1546, "epoch": 3 }, { "type": "loss", "content": 0.00013676950766239315, "timestamp": "2025-09-15 03:17:33.708405", "step": 1547, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:33.739109", "step": 1547, "epoch": 3 }, { "type": "loss", "content": 0.001560694188810885, "timestamp": "2025-09-15 03:17:33.763827", "step": 1548, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:33.794886", "step": 1548, "epoch": 3 }, { "type": "loss", "content": 0.003178370650857687, "timestamp": "2025-09-15 03:17:33.796961", "step": 1549, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:33.827609", "step": 1549, "epoch": 3 }, { "type": "loss", "content": 9.050655353348702e-05, "timestamp": "2025-09-15 03:17:33.831261", "step": 1550, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:33.863106", "step": 1550, "epoch": 3 }, { "type": "loss", "content": 8.560741844121367e-05, "timestamp": "2025-09-15 03:17:33.865271", "step": 1551, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:33.896231", "step": 1551, "epoch": 3 }, { "type": "loss", "content": 3.5383240174269304e-05, "timestamp": "2025-09-15 03:17:33.923540", "step": 1552, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:33.954491", "step": 1552, "epoch": 3 }, { "type": "loss", "content": 0.00016244736616499722, "timestamp": "2025-09-15 03:17:33.958507", "step": 1553, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:33.989745", "step": 1553, "epoch": 3 }, { "type": "loss", "content": 6.825554737588391e-05, "timestamp": "2025-09-15 03:17:33.996237", "step": 1554, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:34.027115", "step": 1554, "epoch": 3 }, { "type": "loss", "content": 6.661610677838326e-05, "timestamp": "2025-09-15 03:17:34.030857", "step": 1555, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:34.061911", "step": 1555, "epoch": 3 }, { "type": "loss", "content": 0.0005459814565256238, "timestamp": "2025-09-15 03:17:34.089428", "step": 1556, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:34.120076", "step": 1556, "epoch": 3 }, { "type": "loss", "content": 9.487460920354351e-05, "timestamp": "2025-09-15 03:17:34.122130", "step": 1557, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:34.152569", "step": 1557, "epoch": 3 }, { "type": "loss", "content": 0.0005464908317662776, "timestamp": "2025-09-15 03:17:34.154995", "step": 1558, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:34.190420", "step": 1558, "epoch": 3 }, { "type": "loss", "content": 0.0005193923716433346, "timestamp": "2025-09-15 03:17:34.193712", "step": 1559, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:34.230142", "step": 1559, "epoch": 3 }, { "type": "loss", "content": 0.00015628755500074476, "timestamp": "2025-09-15 03:17:34.253828", "step": 1560, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:17:34.844433", "step": 1560, "epoch": 3 }, { "type": "pplx", "content": 127046711.74826264, "timestamp": "2025-09-15 03:17:34.846766", "step": 1560, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:34.875719", "step": 1560, "epoch": 3 }, { "type": "loss", "content": 0.00029895914485678077, "timestamp": "2025-09-15 03:17:34.877717", "step": 1561, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:34.908207", "step": 1561, "epoch": 3 }, { "type": "loss", "content": 0.000372684356989339, "timestamp": "2025-09-15 03:17:34.912332", "step": 1562, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:34.942547", "step": 1562, "epoch": 3 }, { "type": "loss", "content": 0.00044768728548660874, "timestamp": "2025-09-15 03:17:34.944641", "step": 1563, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:34.975107", "step": 1563, "epoch": 3 }, { "type": "loss", "content": 0.0016328077763319016, "timestamp": "2025-09-15 03:17:34.998711", "step": 1564, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:35.028816", "step": 1564, "epoch": 3 }, { "type": "loss", "content": 0.06172548979520798, "timestamp": "2025-09-15 03:17:35.031046", "step": 1565, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:35.061231", "step": 1565, "epoch": 3 }, { "type": "loss", "content": 8.873850310919806e-05, "timestamp": "2025-09-15 03:17:35.063414", "step": 1566, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:35.093886", "step": 1566, "epoch": 3 }, { "type": "loss", "content": 0.0033889301121234894, "timestamp": "2025-09-15 03:17:35.097818", "step": 1567, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:35.130488", "step": 1567, "epoch": 3 }, { "type": "loss", "content": 0.00027142881299369037, "timestamp": "2025-09-15 03:17:35.158736", "step": 1568, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:35.189673", "step": 1568, "epoch": 3 }, { "type": "loss", "content": 0.0022615992929786444, "timestamp": "2025-09-15 03:17:35.191772", "step": 1569, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:35.222456", "step": 1569, "epoch": 3 }, { "type": "loss", "content": 0.000650630914606154, "timestamp": "2025-09-15 03:17:35.226391", "step": 1570, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:35.257331", "step": 1570, "epoch": 3 }, { "type": "loss", "content": 5.338179835234769e-05, "timestamp": "2025-09-15 03:17:35.261510", "step": 1571, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:35.292670", "step": 1571, "epoch": 3 }, { "type": "loss", "content": 3.176413520122878e-05, "timestamp": "2025-09-15 03:17:35.317451", "step": 1572, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:35.347856", "step": 1572, "epoch": 3 }, { "type": "loss", "content": 3.908006692654453e-05, "timestamp": "2025-09-15 03:17:35.350192", "step": 1573, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:35.380146", "step": 1573, "epoch": 3 }, { "type": "loss", "content": 3.330052277306095e-05, "timestamp": "2025-09-15 03:17:35.382355", "step": 1574, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:35.412864", "step": 1574, "epoch": 3 }, { "type": "loss", "content": 4.1809329559328035e-05, "timestamp": "2025-09-15 03:17:35.420352", "step": 1575, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:35.450831", "step": 1575, "epoch": 3 }, { "type": "loss", "content": 0.00010141608800040558, "timestamp": "2025-09-15 03:17:35.474429", "step": 1576, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:35.504856", "step": 1576, "epoch": 3 }, { "type": "loss", "content": 0.0002838522777892649, "timestamp": "2025-09-15 03:17:35.506933", "step": 1577, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:35.537208", "step": 1577, "epoch": 3 }, { "type": "loss", "content": 0.00032633551745675504, "timestamp": "2025-09-15 03:17:35.539265", "step": 1578, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:35.570627", "step": 1578, "epoch": 3 }, { "type": "loss", "content": 6.84321130393073e-05, "timestamp": "2025-09-15 03:17:35.577585", "step": 1579, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:35.608101", "step": 1579, "epoch": 3 }, { "type": "loss", "content": 0.0007345884805545211, "timestamp": "2025-09-15 03:17:35.631677", "step": 1580, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:35.662201", "step": 1580, "epoch": 3 }, { "type": "loss", "content": 0.0004944135434925556, "timestamp": "2025-09-15 03:17:35.664209", "step": 1581, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:35.694173", "step": 1581, "epoch": 3 }, { "type": "loss", "content": 6.49203357170336e-05, "timestamp": "2025-09-15 03:17:35.696214", "step": 1582, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:35.726693", "step": 1582, "epoch": 3 }, { "type": "loss", "content": 3.8760634197387844e-05, "timestamp": "2025-09-15 03:17:35.730028", "step": 1583, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:35.760249", "step": 1583, "epoch": 3 }, { "type": "loss", "content": 0.00014414360339287668, "timestamp": "2025-09-15 03:17:35.783847", "step": 1584, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:35.814239", "step": 1584, "epoch": 3 }, { "type": "loss", "content": 0.00013764832692686468, "timestamp": "2025-09-15 03:17:35.816187", "step": 1585, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:35.846022", "step": 1585, "epoch": 3 }, { "type": "loss", "content": 6.031978045939468e-05, "timestamp": "2025-09-15 03:17:35.848592", "step": 1586, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:35.880434", "step": 1586, "epoch": 3 }, { "type": "loss", "content": 5.813813186250627e-05, "timestamp": "2025-09-15 03:17:35.887020", "step": 1587, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:35.916803", "step": 1587, "epoch": 3 }, { "type": "loss", "content": 8.35294122225605e-05, "timestamp": "2025-09-15 03:17:35.940522", "step": 1588, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:35.971881", "step": 1588, "epoch": 3 }, { "type": "loss", "content": 0.0017162241274490952, "timestamp": "2025-09-15 03:17:35.973874", "step": 1589, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:36.004162", "step": 1589, "epoch": 3 }, { "type": "loss", "content": 0.016213053837418556, "timestamp": "2025-09-15 03:17:36.006677", "step": 1590, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:36.038240", "step": 1590, "epoch": 3 }, { "type": "loss", "content": 5.837536446051672e-05, "timestamp": "2025-09-15 03:17:36.044716", "step": 1591, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:36.074986", "step": 1591, "epoch": 3 }, { "type": "loss", "content": 0.0015365114668384194, "timestamp": "2025-09-15 03:17:36.099835", "step": 1592, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:36.130061", "step": 1592, "epoch": 3 }, { "type": "loss", "content": 0.0001333472173428163, "timestamp": "2025-09-15 03:17:36.133005", "step": 1593, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:36.165253", "step": 1593, "epoch": 3 }, { "type": "loss", "content": 2.8431244572857395e-05, "timestamp": "2025-09-15 03:17:36.169276", "step": 1594, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:36.199455", "step": 1594, "epoch": 3 }, { "type": "loss", "content": 4.7248369810404256e-05, "timestamp": "2025-09-15 03:17:36.201498", "step": 1595, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:36.232692", "step": 1595, "epoch": 3 }, { "type": "loss", "content": 7.492154691135511e-05, "timestamp": "2025-09-15 03:17:36.260500", "step": 1596, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:36.290677", "step": 1596, "epoch": 3 }, { "type": "loss", "content": 6.914538244018331e-05, "timestamp": "2025-09-15 03:17:36.292803", "step": 1597, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:36.323646", "step": 1597, "epoch": 3 }, { "type": "loss", "content": 0.0001433978322893381, "timestamp": "2025-09-15 03:17:36.330323", "step": 1598, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:36.360701", "step": 1598, "epoch": 3 }, { "type": "loss", "content": 0.0006194100715219975, "timestamp": "2025-09-15 03:17:36.362701", "step": 1599, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:17:36.957874", "step": 1599, "epoch": 3 }, { "type": "pplx", "content": 122925884.74202192, "timestamp": "2025-09-15 03:17:36.959624", "step": 1599, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:36.988130", "step": 1599, "epoch": 3 }, { "type": "loss", "content": 4.66671226604376e-05, "timestamp": "2025-09-15 03:17:37.011830", "step": 1600, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:37.043243", "step": 1600, "epoch": 3 }, { "type": "loss", "content": 0.00011773057485697791, "timestamp": "2025-09-15 03:17:37.046501", "step": 1601, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:37.084855", "step": 1601, "epoch": 3 }, { "type": "loss", "content": 0.001495333737693727, "timestamp": "2025-09-15 03:17:37.094864", "step": 1602, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:37.126743", "step": 1602, "epoch": 3 }, { "type": "loss", "content": 0.00011041124525945634, "timestamp": "2025-09-15 03:17:37.129126", "step": 1603, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:37.159428", "step": 1603, "epoch": 3 }, { "type": "loss", "content": 5.4903135605854914e-05, "timestamp": "2025-09-15 03:17:37.183095", "step": 1604, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:37.213306", "step": 1604, "epoch": 3 }, { "type": "loss", "content": 9.56984149524942e-05, "timestamp": "2025-09-15 03:17:37.216111", "step": 1605, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:17:37.247474", "step": 1605, "epoch": 3 }, { "type": "loss", "content": 2.949278859887272e-05, "timestamp": "2025-09-15 03:17:37.254999", "step": 1606, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:37.288946", "step": 1606, "epoch": 3 }, { "type": "loss", "content": 0.00021853976068086922, "timestamp": "2025-09-15 03:17:37.291447", "step": 1607, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:37.330739", "step": 1607, "epoch": 3 }, { "type": "loss", "content": 0.00010619118984322995, "timestamp": "2025-09-15 03:17:37.361134", "step": 1608, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:37.392648", "step": 1608, "epoch": 3 }, { "type": "loss", "content": 0.0009482751484028995, "timestamp": "2025-09-15 03:17:37.398185", "step": 1609, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:37.438540", "step": 1609, "epoch": 3 }, { "type": "loss", "content": 0.01618199609220028, "timestamp": "2025-09-15 03:17:37.440563", "step": 1610, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:37.471787", "step": 1610, "epoch": 3 }, { "type": "loss", "content": 3.588308027246967e-05, "timestamp": "2025-09-15 03:17:37.474084", "step": 1611, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:37.506528", "step": 1611, "epoch": 3 }, { "type": "loss", "content": 4.4485444959718734e-05, "timestamp": "2025-09-15 03:17:37.529941", "step": 1612, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:37.560595", "step": 1612, "epoch": 3 }, { "type": "loss", "content": 0.00016524593229405582, "timestamp": "2025-09-15 03:17:37.562649", "step": 1613, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:37.592853", "step": 1613, "epoch": 3 }, { "type": "loss", "content": 0.00020557189418468624, "timestamp": "2025-09-15 03:17:37.597162", "step": 1614, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:37.627512", "step": 1614, "epoch": 3 }, { "type": "loss", "content": 0.002822331851348281, "timestamp": "2025-09-15 03:17:37.629930", "step": 1615, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:37.660540", "step": 1615, "epoch": 3 }, { "type": "loss", "content": 0.00014614405517932028, "timestamp": "2025-09-15 03:17:37.685416", "step": 1616, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:37.715651", "step": 1616, "epoch": 3 }, { "type": "loss", "content": 0.0031574785243719816, "timestamp": "2025-09-15 03:17:37.718353", "step": 1617, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:37.748758", "step": 1617, "epoch": 3 }, { "type": "loss", "content": 3.237018609070219e-05, "timestamp": "2025-09-15 03:17:37.750827", "step": 1618, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:37.780957", "step": 1618, "epoch": 3 }, { "type": "loss", "content": 0.00010914222366409376, "timestamp": "2025-09-15 03:17:37.782704", "step": 1619, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:37.812760", "step": 1619, "epoch": 3 }, { "type": "loss", "content": 2.9762662961729802e-05, "timestamp": "2025-09-15 03:17:37.836095", "step": 1620, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:37.865983", "step": 1620, "epoch": 3 }, { "type": "loss", "content": 0.0014514749636873603, "timestamp": "2025-09-15 03:17:37.868050", "step": 1621, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:37.898253", "step": 1621, "epoch": 3 }, { "type": "loss", "content": 0.004247348755598068, "timestamp": "2025-09-15 03:17:37.904837", "step": 1622, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:37.936106", "step": 1622, "epoch": 3 }, { "type": "loss", "content": 0.00014622985327150673, "timestamp": "2025-09-15 03:17:37.940361", "step": 1623, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:37.972645", "step": 1623, "epoch": 3 }, { "type": "loss", "content": 0.0014752390561625361, "timestamp": "2025-09-15 03:17:38.000748", "step": 1624, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:38.031819", "step": 1624, "epoch": 3 }, { "type": "loss", "content": 0.0016322402516379952, "timestamp": "2025-09-15 03:17:38.033832", "step": 1625, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:38.064419", "step": 1625, "epoch": 3 }, { "type": "loss", "content": 0.0025306600145995617, "timestamp": "2025-09-15 03:17:38.066466", "step": 1626, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:38.098091", "step": 1626, "epoch": 3 }, { "type": "loss", "content": 0.00017592271615285426, "timestamp": "2025-09-15 03:17:38.100110", "step": 1627, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:38.130791", "step": 1627, "epoch": 3 }, { "type": "loss", "content": 0.00015544805501122028, "timestamp": "2025-09-15 03:17:38.158302", "step": 1628, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:38.190106", "step": 1628, "epoch": 3 }, { "type": "loss", "content": 0.00015891435032244772, "timestamp": "2025-09-15 03:17:38.194633", "step": 1629, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:38.225390", "step": 1629, "epoch": 3 }, { "type": "loss", "content": 0.00015892648661974818, "timestamp": "2025-09-15 03:17:38.232185", "step": 1630, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:38.262894", "step": 1630, "epoch": 3 }, { "type": "loss", "content": 0.00010154654592042789, "timestamp": "2025-09-15 03:17:38.264856", "step": 1631, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:38.295288", "step": 1631, "epoch": 3 }, { "type": "loss", "content": 5.215726196183823e-05, "timestamp": "2025-09-15 03:17:38.323422", "step": 1632, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:38.353612", "step": 1632, "epoch": 3 }, { "type": "loss", "content": 9.126593067776412e-05, "timestamp": "2025-09-15 03:17:38.355599", "step": 1633, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:38.385887", "step": 1633, "epoch": 3 }, { "type": "loss", "content": 0.0001367527002003044, "timestamp": "2025-09-15 03:17:38.388311", "step": 1634, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:38.418447", "step": 1634, "epoch": 3 }, { "type": "loss", "content": 7.14808120392263e-05, "timestamp": "2025-09-15 03:17:38.420579", "step": 1635, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:38.451590", "step": 1635, "epoch": 3 }, { "type": "loss", "content": 9.053764370037243e-05, "timestamp": "2025-09-15 03:17:38.478913", "step": 1636, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:38.508769", "step": 1636, "epoch": 3 }, { "type": "loss", "content": 0.000221226378926076, "timestamp": "2025-09-15 03:17:38.510717", "step": 1637, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:38.541352", "step": 1637, "epoch": 3 }, { "type": "loss", "content": 0.00017531040066387504, "timestamp": "2025-09-15 03:17:38.543386", "step": 1638, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:17:39.127497", "step": 1638, "epoch": 3 }, { "type": "pplx", "content": 129207242.84349388, "timestamp": "2025-09-15 03:17:39.129391", "step": 1638, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:39.158372", "step": 1638, "epoch": 3 }, { "type": "loss", "content": 2.8826221750932746e-05, "timestamp": "2025-09-15 03:17:39.164900", "step": 1639, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:39.195486", "step": 1639, "epoch": 3 }, { "type": "loss", "content": 0.00022367548081092536, "timestamp": "2025-09-15 03:17:39.219110", "step": 1640, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:39.250174", "step": 1640, "epoch": 3 }, { "type": "loss", "content": 0.0002002513501793146, "timestamp": "2025-09-15 03:17:39.252287", "step": 1641, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-15 03:17:39.283472", "step": 1641, "epoch": 3 }, { "type": "loss", "content": 5.098511974210851e-05, "timestamp": "2025-09-15 03:17:39.293213", "step": 1642, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:39.323618", "step": 1642, "epoch": 3 }, { "type": "loss", "content": 0.0005090952618047595, "timestamp": "2025-09-15 03:17:39.325691", "step": 1643, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:39.356005", "step": 1643, "epoch": 3 }, { "type": "loss", "content": 0.00013635992945637554, "timestamp": "2025-09-15 03:17:39.381004", "step": 1644, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:39.412639", "step": 1644, "epoch": 3 }, { "type": "loss", "content": 0.0062268441542983055, "timestamp": "2025-09-15 03:17:39.417494", "step": 1645, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:39.448202", "step": 1645, "epoch": 3 }, { "type": "loss", "content": 0.00019006979709956795, "timestamp": "2025-09-15 03:17:39.450407", "step": 1646, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:39.480972", "step": 1646, "epoch": 3 }, { "type": "loss", "content": 5.067802339908667e-05, "timestamp": "2025-09-15 03:17:39.482899", "step": 1647, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:39.514222", "step": 1647, "epoch": 3 }, { "type": "loss", "content": 0.00024163494526874274, "timestamp": "2025-09-15 03:17:39.542616", "step": 1648, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:39.575048", "step": 1648, "epoch": 3 }, { "type": "loss", "content": 0.00035120677785016596, "timestamp": "2025-09-15 03:17:39.579842", "step": 1649, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:39.611782", "step": 1649, "epoch": 3 }, { "type": "loss", "content": 5.8132307458436117e-05, "timestamp": "2025-09-15 03:17:39.613926", "step": 1650, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-15 03:17:39.645586", "step": 1650, "epoch": 3 }, { "type": "loss", "content": 0.0001509405265096575, "timestamp": "2025-09-15 03:17:39.655709", "step": 1651, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:39.686160", "step": 1651, "epoch": 3 }, { "type": "loss", "content": 3.116812513326295e-05, "timestamp": "2025-09-15 03:17:39.711410", "step": 1652, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:39.741900", "step": 1652, "epoch": 3 }, { "type": "loss", "content": 9.823811706155539e-05, "timestamp": "2025-09-15 03:17:39.747089", "step": 1653, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:39.777467", "step": 1653, "epoch": 3 }, { "type": "loss", "content": 2.2268699467531405e-05, "timestamp": "2025-09-15 03:17:39.779608", "step": 1654, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:39.810971", "step": 1654, "epoch": 3 }, { "type": "loss", "content": 0.0004127257561776787, "timestamp": "2025-09-15 03:17:39.813080", "step": 1655, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:39.843089", "step": 1655, "epoch": 3 }, { "type": "loss", "content": 0.00010350901720812544, "timestamp": "2025-09-15 03:17:39.866359", "step": 1656, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:39.895946", "step": 1656, "epoch": 3 }, { "type": "loss", "content": 0.00023793868604116142, "timestamp": "2025-09-15 03:17:39.897910", "step": 1657, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:39.928427", "step": 1657, "epoch": 3 }, { "type": "loss", "content": 3.0240338674047962e-05, "timestamp": "2025-09-15 03:17:39.930544", "step": 1658, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:39.960872", "step": 1658, "epoch": 3 }, { "type": "loss", "content": 0.000498669920489192, "timestamp": "2025-09-15 03:17:39.963099", "step": 1659, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:39.993800", "step": 1659, "epoch": 3 }, { "type": "loss", "content": 3.756927981157787e-05, "timestamp": "2025-09-15 03:17:40.019025", "step": 1660, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:40.049851", "step": 1660, "epoch": 3 }, { "type": "loss", "content": 0.0003786278539337218, "timestamp": "2025-09-15 03:17:40.054126", "step": 1661, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:40.084162", "step": 1661, "epoch": 3 }, { "type": "loss", "content": 0.0014668498188257217, "timestamp": "2025-09-15 03:17:40.086820", "step": 1662, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:40.117725", "step": 1662, "epoch": 3 }, { "type": "loss", "content": 3.2887041015783325e-05, "timestamp": "2025-09-15 03:17:40.124820", "step": 1663, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:40.155642", "step": 1663, "epoch": 3 }, { "type": "loss", "content": 0.0002688818785827607, "timestamp": "2025-09-15 03:17:40.180454", "step": 1664, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 336 ], "flops": 9966940982208 }, "timestamp": "2025-09-15 03:17:40.211731", "step": 1664, "epoch": 3 }, { "type": "loss", "content": 0.00021670998830813915, "timestamp": "2025-09-15 03:17:40.224444", "step": 1665, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:40.254860", "step": 1665, "epoch": 3 }, { "type": "loss", "content": 0.03699951618909836, "timestamp": "2025-09-15 03:17:40.256896", "step": 1666, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:40.287111", "step": 1666, "epoch": 3 }, { "type": "loss", "content": 4.46015692432411e-05, "timestamp": "2025-09-15 03:17:40.289244", "step": 1667, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:40.318654", "step": 1667, "epoch": 3 }, { "type": "loss", "content": 4.677437027567066e-05, "timestamp": "2025-09-15 03:17:40.341998", "step": 1668, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:40.373612", "step": 1668, "epoch": 3 }, { "type": "loss", "content": 2.693878013815265e-05, "timestamp": "2025-09-15 03:17:40.378392", "step": 1669, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:40.408649", "step": 1669, "epoch": 3 }, { "type": "loss", "content": 9.786576265469193e-05, "timestamp": "2025-09-15 03:17:40.410841", "step": 1670, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:40.441928", "step": 1670, "epoch": 3 }, { "type": "loss", "content": 0.00011372385051799938, "timestamp": "2025-09-15 03:17:40.444060", "step": 1671, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:40.476153", "step": 1671, "epoch": 3 }, { "type": "loss", "content": 4.8357556806877255e-05, "timestamp": "2025-09-15 03:17:40.501045", "step": 1672, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:40.537066", "step": 1672, "epoch": 3 }, { "type": "loss", "content": 0.025152156129479408, "timestamp": "2025-09-15 03:17:40.539372", "step": 1673, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:40.570462", "step": 1673, "epoch": 3 }, { "type": "loss", "content": 0.0001292003144044429, "timestamp": "2025-09-15 03:17:40.572601", "step": 1674, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:40.602754", "step": 1674, "epoch": 3 }, { "type": "loss", "content": 7.244075823109597e-05, "timestamp": "2025-09-15 03:17:40.605069", "step": 1675, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:40.635816", "step": 1675, "epoch": 3 }, { "type": "loss", "content": 6.385731103364378e-05, "timestamp": "2025-09-15 03:17:40.659405", "step": 1676, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:40.690011", "step": 1676, "epoch": 3 }, { "type": "loss", "content": 4.8743659135652706e-05, "timestamp": "2025-09-15 03:17:40.694849", "step": 1677, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:17:41.281619", "step": 1677, "epoch": 3 }, { "type": "pplx", "content": 135761003.76394957, "timestamp": "2025-09-15 03:17:41.286975", "step": 1677, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:41.320621", "step": 1677, "epoch": 3 }, { "type": "loss", "content": 4.129093940719031e-05, "timestamp": "2025-09-15 03:17:41.323529", "step": 1678, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:41.354603", "step": 1678, "epoch": 3 }, { "type": "loss", "content": 9.938734729075804e-05, "timestamp": "2025-09-15 03:17:41.361197", "step": 1679, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:41.391695", "step": 1679, "epoch": 3 }, { "type": "loss", "content": 0.0008803194505162537, "timestamp": "2025-09-15 03:17:41.416373", "step": 1680, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:41.451837", "step": 1680, "epoch": 3 }, { "type": "loss", "content": 4.711677320301533e-05, "timestamp": "2025-09-15 03:17:41.458294", "step": 1681, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:17:41.492780", "step": 1681, "epoch": 3 }, { "type": "loss", "content": 0.0001084799223463051, "timestamp": "2025-09-15 03:17:41.500333", "step": 1682, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:41.531971", "step": 1682, "epoch": 3 }, { "type": "loss", "content": 4.469270788831636e-05, "timestamp": "2025-09-15 03:17:41.534657", "step": 1683, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:41.565164", "step": 1683, "epoch": 3 }, { "type": "loss", "content": 0.00013767420023214072, "timestamp": "2025-09-15 03:17:41.593367", "step": 1684, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:41.623730", "step": 1684, "epoch": 3 }, { "type": "loss", "content": 8.43398374854587e-05, "timestamp": "2025-09-15 03:17:41.625900", "step": 1685, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:41.656635", "step": 1685, "epoch": 3 }, { "type": "loss", "content": 4.093054303666577e-05, "timestamp": "2025-09-15 03:17:41.659183", "step": 1686, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:41.689677", "step": 1686, "epoch": 3 }, { "type": "loss", "content": 6.223317177500576e-05, "timestamp": "2025-09-15 03:17:41.696270", "step": 1687, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:41.726827", "step": 1687, "epoch": 3 }, { "type": "loss", "content": 7.752107921987772e-05, "timestamp": "2025-09-15 03:17:41.750215", "step": 1688, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:41.781636", "step": 1688, "epoch": 3 }, { "type": "loss", "content": 0.0001287296909140423, "timestamp": "2025-09-15 03:17:41.786424", "step": 1689, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:41.816332", "step": 1689, "epoch": 3 }, { "type": "loss", "content": 6.89863009029068e-05, "timestamp": "2025-09-15 03:17:41.818560", "step": 1690, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:41.848871", "step": 1690, "epoch": 3 }, { "type": "loss", "content": 9.750080789672211e-05, "timestamp": "2025-09-15 03:17:41.851272", "step": 1691, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:41.881635", "step": 1691, "epoch": 3 }, { "type": "loss", "content": 3.922905307263136e-05, "timestamp": "2025-09-15 03:17:41.905383", "step": 1692, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:41.936710", "step": 1692, "epoch": 3 }, { "type": "loss", "content": 5.144997703609988e-05, "timestamp": "2025-09-15 03:17:41.938888", "step": 1693, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:41.969565", "step": 1693, "epoch": 3 }, { "type": "loss", "content": 2.403518919891212e-05, "timestamp": "2025-09-15 03:17:41.971654", "step": 1694, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:42.001595", "step": 1694, "epoch": 3 }, { "type": "loss", "content": 0.0002912590862251818, "timestamp": "2025-09-15 03:17:42.003765", "step": 1695, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:42.034418", "step": 1695, "epoch": 3 }, { "type": "loss", "content": 0.00025712628848850727, "timestamp": "2025-09-15 03:17:42.059188", "step": 1696, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:42.089426", "step": 1696, "epoch": 3 }, { "type": "loss", "content": 7.03453624737449e-05, "timestamp": "2025-09-15 03:17:42.091097", "step": 1697, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:42.121163", "step": 1697, "epoch": 3 }, { "type": "loss", "content": 6.731459870934486e-05, "timestamp": "2025-09-15 03:17:42.123087", "step": 1698, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:42.153182", "step": 1698, "epoch": 3 }, { "type": "loss", "content": 0.0017748010577633977, "timestamp": "2025-09-15 03:17:42.157329", "step": 1699, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:42.188002", "step": 1699, "epoch": 3 }, { "type": "loss", "content": 6.168592517497018e-05, "timestamp": "2025-09-15 03:17:42.211696", "step": 1700, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:42.241704", "step": 1700, "epoch": 3 }, { "type": "loss", "content": 0.0001511227892478928, "timestamp": "2025-09-15 03:17:42.243621", "step": 1701, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:42.273960", "step": 1701, "epoch": 3 }, { "type": "loss", "content": 0.0003747472946997732, "timestamp": "2025-09-15 03:17:42.280554", "step": 1702, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-15 03:17:42.311650", "step": 1702, "epoch": 3 }, { "type": "loss", "content": 0.00010352622484788299, "timestamp": "2025-09-15 03:17:42.321099", "step": 1703, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:42.353220", "step": 1703, "epoch": 3 }, { "type": "loss", "content": 0.0008733622962608933, "timestamp": "2025-09-15 03:17:42.380767", "step": 1704, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:42.411800", "step": 1704, "epoch": 3 }, { "type": "loss", "content": 0.00016946492542047054, "timestamp": "2025-09-15 03:17:42.415286", "step": 1705, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:42.445197", "step": 1705, "epoch": 3 }, { "type": "loss", "content": 0.00017234814004041255, "timestamp": "2025-09-15 03:17:42.447671", "step": 1706, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:42.479684", "step": 1706, "epoch": 3 }, { "type": "loss", "content": 6.766520527889952e-05, "timestamp": "2025-09-15 03:17:42.481953", "step": 1707, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:42.513059", "step": 1707, "epoch": 3 }, { "type": "loss", "content": 0.000172713422216475, "timestamp": "2025-09-15 03:17:42.538190", "step": 1708, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:42.568272", "step": 1708, "epoch": 3 }, { "type": "loss", "content": 0.0005255985306575894, "timestamp": "2025-09-15 03:17:42.570532", "step": 1709, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:42.601059", "step": 1709, "epoch": 3 }, { "type": "loss", "content": 5.218220758251846e-05, "timestamp": "2025-09-15 03:17:42.603193", "step": 1710, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:42.633773", "step": 1710, "epoch": 3 }, { "type": "loss", "content": 0.00021290150471031666, "timestamp": "2025-09-15 03:17:42.637784", "step": 1711, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:42.668196", "step": 1711, "epoch": 3 }, { "type": "loss", "content": 0.0010456795571371913, "timestamp": "2025-09-15 03:17:42.692181", "step": 1712, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:42.722741", "step": 1712, "epoch": 3 }, { "type": "loss", "content": 7.15435016900301e-05, "timestamp": "2025-09-15 03:17:42.725108", "step": 1713, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:42.757247", "step": 1713, "epoch": 3 }, { "type": "loss", "content": 0.0016514122253283858, "timestamp": "2025-09-15 03:17:42.760580", "step": 1714, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:42.791853", "step": 1714, "epoch": 3 }, { "type": "loss", "content": 6.494522676803172e-05, "timestamp": "2025-09-15 03:17:42.794005", "step": 1715, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:42.825168", "step": 1715, "epoch": 3 }, { "type": "loss", "content": 5.324947051121853e-05, "timestamp": "2025-09-15 03:17:42.848715", "step": 1716, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:17:43.438548", "step": 1716, "epoch": 3 }, { "type": "pplx", "content": 138965352.20121047, "timestamp": "2025-09-15 03:17:43.440687", "step": 1716, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:43.469520", "step": 1716, "epoch": 3 }, { "type": "loss", "content": 6.845069583505392e-05, "timestamp": "2025-09-15 03:17:43.471651", "step": 1717, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:43.502230", "step": 1717, "epoch": 3 }, { "type": "loss", "content": 3.28231108142063e-05, "timestamp": "2025-09-15 03:17:43.506566", "step": 1718, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:43.537483", "step": 1718, "epoch": 3 }, { "type": "loss", "content": 0.0023003576789051294, "timestamp": "2025-09-15 03:17:43.544153", "step": 1719, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:43.574979", "step": 1719, "epoch": 3 }, { "type": "loss", "content": 0.00016819333541207016, "timestamp": "2025-09-15 03:17:43.600063", "step": 1720, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:43.636281", "step": 1720, "epoch": 3 }, { "type": "loss", "content": 7.036477472865954e-05, "timestamp": "2025-09-15 03:17:43.638425", "step": 1721, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:43.670225", "step": 1721, "epoch": 3 }, { "type": "loss", "content": 7.370777893811464e-05, "timestamp": "2025-09-15 03:17:43.676826", "step": 1722, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:43.707432", "step": 1722, "epoch": 3 }, { "type": "loss", "content": 0.00018600482144393027, "timestamp": "2025-09-15 03:17:43.709773", "step": 1723, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:43.740517", "step": 1723, "epoch": 3 }, { "type": "loss", "content": 0.000139830241096206, "timestamp": "2025-09-15 03:17:43.764287", "step": 1724, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:43.795428", "step": 1724, "epoch": 3 }, { "type": "loss", "content": 0.029730452224612236, "timestamp": "2025-09-15 03:17:43.797441", "step": 1725, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:43.827379", "step": 1725, "epoch": 3 }, { "type": "loss", "content": 0.00035227558691985905, "timestamp": "2025-09-15 03:17:43.829398", "step": 1726, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:43.859975", "step": 1726, "epoch": 3 }, { "type": "loss", "content": 0.00037362094735726714, "timestamp": "2025-09-15 03:17:43.862117", "step": 1727, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:43.894698", "step": 1727, "epoch": 3 }, { "type": "loss", "content": 8.00935085862875e-05, "timestamp": "2025-09-15 03:17:43.919587", "step": 1728, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:43.952324", "step": 1728, "epoch": 3 }, { "type": "loss", "content": 0.0001457637408748269, "timestamp": "2025-09-15 03:17:43.954398", "step": 1729, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:43.985154", "step": 1729, "epoch": 3 }, { "type": "loss", "content": 0.00011651766544673592, "timestamp": "2025-09-15 03:17:43.987336", "step": 1730, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 272 ], "flops": 8068526078144 }, "timestamp": "2025-09-15 03:17:44.017947", "step": 1730, "epoch": 3 }, { "type": "loss", "content": 0.00023613232770003378, "timestamp": "2025-09-15 03:17:44.027936", "step": 1731, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:44.058871", "step": 1731, "epoch": 3 }, { "type": "loss", "content": 4.856818122789264e-05, "timestamp": "2025-09-15 03:17:44.082395", "step": 1732, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:44.113761", "step": 1732, "epoch": 3 }, { "type": "loss", "content": 3.342150739626959e-05, "timestamp": "2025-09-15 03:17:44.115756", "step": 1733, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:44.146674", "step": 1733, "epoch": 3 }, { "type": "loss", "content": 0.0007277569966390729, "timestamp": "2025-09-15 03:17:44.148838", "step": 1734, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:44.179897", "step": 1734, "epoch": 3 }, { "type": "loss", "content": 2.9094177079969086e-05, "timestamp": "2025-09-15 03:17:44.181940", "step": 1735, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:44.213149", "step": 1735, "epoch": 3 }, { "type": "loss", "content": 0.00013098781346343458, "timestamp": "2025-09-15 03:17:44.237940", "step": 1736, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:44.268487", "step": 1736, "epoch": 3 }, { "type": "loss", "content": 0.0003191656433045864, "timestamp": "2025-09-15 03:17:44.271011", "step": 1737, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:44.301703", "step": 1737, "epoch": 3 }, { "type": "loss", "content": 0.00011755731975426897, "timestamp": "2025-09-15 03:17:44.304014", "step": 1738, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:17:44.338186", "step": 1738, "epoch": 3 }, { "type": "loss", "content": 6.14327727816999e-05, "timestamp": "2025-09-15 03:17:44.345775", "step": 1739, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:44.377737", "step": 1739, "epoch": 3 }, { "type": "loss", "content": 8.636755228508264e-05, "timestamp": "2025-09-15 03:17:44.402384", "step": 1740, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:44.433147", "step": 1740, "epoch": 3 }, { "type": "loss", "content": 0.000406697450671345, "timestamp": "2025-09-15 03:17:44.435279", "step": 1741, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:44.465395", "step": 1741, "epoch": 3 }, { "type": "loss", "content": 8.220264862757176e-05, "timestamp": "2025-09-15 03:17:44.469820", "step": 1742, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:44.500540", "step": 1742, "epoch": 3 }, { "type": "loss", "content": 0.0001976119092432782, "timestamp": "2025-09-15 03:17:44.503207", "step": 1743, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:17:44.533651", "step": 1743, "epoch": 3 }, { "type": "loss", "content": 0.00012376924860291183, "timestamp": "2025-09-15 03:17:44.562249", "step": 1744, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:44.591645", "step": 1744, "epoch": 3 }, { "type": "loss", "content": 0.00025345777976326644, "timestamp": "2025-09-15 03:17:44.593899", "step": 1745, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:44.624985", "step": 1745, "epoch": 3 }, { "type": "loss", "content": 0.014335216954350471, "timestamp": "2025-09-15 03:17:44.631391", "step": 1746, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:44.662438", "step": 1746, "epoch": 3 }, { "type": "loss", "content": 0.006056909915059805, "timestamp": "2025-09-15 03:17:44.664566", "step": 1747, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:44.695551", "step": 1747, "epoch": 3 }, { "type": "loss", "content": 0.00012910777877550572, "timestamp": "2025-09-15 03:17:44.719509", "step": 1748, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 304 ], "flops": 9017733530176 }, "timestamp": "2025-09-15 03:17:44.750316", "step": 1748, "epoch": 3 }, { "type": "loss", "content": 0.00021627625392284244, "timestamp": "2025-09-15 03:17:44.759954", "step": 1749, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:44.791299", "step": 1749, "epoch": 3 }, { "type": "loss", "content": 4.3783224100479856e-05, "timestamp": "2025-09-15 03:17:44.795272", "step": 1750, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:44.826847", "step": 1750, "epoch": 3 }, { "type": "loss", "content": 0.0003299799282103777, "timestamp": "2025-09-15 03:17:44.830789", "step": 1751, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:44.862992", "step": 1751, "epoch": 3 }, { "type": "loss", "content": 0.00012049104407196864, "timestamp": "2025-09-15 03:17:44.888120", "step": 1752, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:44.919159", "step": 1752, "epoch": 3 }, { "type": "loss", "content": 0.00011262983025517315, "timestamp": "2025-09-15 03:17:44.924025", "step": 1753, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:44.955482", "step": 1753, "epoch": 3 }, { "type": "loss", "content": 9.659709030529484e-05, "timestamp": "2025-09-15 03:17:44.957823", "step": 1754, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:44.990112", "step": 1754, "epoch": 3 }, { "type": "loss", "content": 0.0011848767753690481, "timestamp": "2025-09-15 03:17:44.994887", "step": 1755, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:17:45.595571", "step": 1755, "epoch": 3 }, { "type": "pplx", "content": 140086331.41677055, "timestamp": "2025-09-15 03:17:45.597889", "step": 1755, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:45.626885", "step": 1755, "epoch": 3 }, { "type": "loss", "content": 5.509558832272887e-05, "timestamp": "2025-09-15 03:17:45.654311", "step": 1756, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:45.687158", "step": 1756, "epoch": 3 }, { "type": "loss", "content": 0.0002119831769960001, "timestamp": "2025-09-15 03:17:45.689524", "step": 1757, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:45.720028", "step": 1757, "epoch": 3 }, { "type": "loss", "content": 0.0004407106607686728, "timestamp": "2025-09-15 03:17:45.724456", "step": 1758, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:17:45.754924", "step": 1758, "epoch": 3 }, { "type": "loss", "content": 0.00029242291930131614, "timestamp": "2025-09-15 03:17:45.762539", "step": 1759, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:45.793873", "step": 1759, "epoch": 3 }, { "type": "loss", "content": 0.00021790176106151193, "timestamp": "2025-09-15 03:17:45.818803", "step": 1760, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:45.849546", "step": 1760, "epoch": 3 }, { "type": "loss", "content": 0.00012010778300464153, "timestamp": "2025-09-15 03:17:45.853836", "step": 1761, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 80 ], "flops": 2373281365952 }, "timestamp": "2025-09-15 03:17:45.884420", "step": 1761, "epoch": 3 }, { "type": "loss", "content": 0.00024840538389980793, "timestamp": "2025-09-15 03:17:45.886840", "step": 1762, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:45.917588", "step": 1762, "epoch": 3 }, { "type": "loss", "content": 0.00018551461107563227, "timestamp": "2025-09-15 03:17:45.919864", "step": 1763, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:45.950975", "step": 1763, "epoch": 3 }, { "type": "loss", "content": 0.0006247073761187494, "timestamp": "2025-09-15 03:17:45.978512", "step": 1764, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:46.009176", "step": 1764, "epoch": 3 }, { "type": "loss", "content": 9.605469676898792e-05, "timestamp": "2025-09-15 03:17:46.011359", "step": 1765, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:46.041856", "step": 1765, "epoch": 3 }, { "type": "loss", "content": 0.00016489412519149482, "timestamp": "2025-09-15 03:17:46.044002", "step": 1766, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:46.075531", "step": 1766, "epoch": 3 }, { "type": "loss", "content": 8.419169898843393e-05, "timestamp": "2025-09-15 03:17:46.077708", "step": 1767, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:46.108142", "step": 1767, "epoch": 3 }, { "type": "loss", "content": 0.0018693411257117987, "timestamp": "2025-09-15 03:17:46.131586", "step": 1768, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:46.167000", "step": 1768, "epoch": 3 }, { "type": "loss", "content": 0.0003774032520595938, "timestamp": "2025-09-15 03:17:46.169305", "step": 1769, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:46.200748", "step": 1769, "epoch": 3 }, { "type": "loss", "content": 0.02417862042784691, "timestamp": "2025-09-15 03:17:46.207098", "step": 1770, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:46.237856", "step": 1770, "epoch": 3 }, { "type": "loss", "content": 5.752419019700028e-05, "timestamp": "2025-09-15 03:17:46.240370", "step": 1771, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:46.271758", "step": 1771, "epoch": 3 }, { "type": "loss", "content": 9.621244680602103e-05, "timestamp": "2025-09-15 03:17:46.296919", "step": 1772, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:46.328942", "step": 1772, "epoch": 3 }, { "type": "loss", "content": 6.722491525579244e-05, "timestamp": "2025-09-15 03:17:46.333281", "step": 1773, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:46.364030", "step": 1773, "epoch": 3 }, { "type": "loss", "content": 0.00020308203238528222, "timestamp": "2025-09-15 03:17:46.370538", "step": 1774, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:46.401083", "step": 1774, "epoch": 3 }, { "type": "loss", "content": 0.000121831770229619, "timestamp": "2025-09-15 03:17:46.403224", "step": 1775, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:46.433251", "step": 1775, "epoch": 3 }, { "type": "loss", "content": 3.221361475880258e-05, "timestamp": "2025-09-15 03:17:46.456832", "step": 1776, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:46.487454", "step": 1776, "epoch": 3 }, { "type": "loss", "content": 8.978391997516155e-05, "timestamp": "2025-09-15 03:17:46.489916", "step": 1777, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:46.520135", "step": 1777, "epoch": 3 }, { "type": "loss", "content": 3.9384856791002676e-05, "timestamp": "2025-09-15 03:17:46.522192", "step": 1778, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:46.552442", "step": 1778, "epoch": 3 }, { "type": "loss", "content": 0.0006986562511883676, "timestamp": "2025-09-15 03:17:46.554741", "step": 1779, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:46.585535", "step": 1779, "epoch": 3 }, { "type": "loss", "content": 3.0208628231775947e-05, "timestamp": "2025-09-15 03:17:46.610407", "step": 1780, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:46.640734", "step": 1780, "epoch": 3 }, { "type": "loss", "content": 0.00015801134577486664, "timestamp": "2025-09-15 03:17:46.642717", "step": 1781, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:46.673852", "step": 1781, "epoch": 3 }, { "type": "loss", "content": 7.350670784944668e-05, "timestamp": "2025-09-15 03:17:46.677442", "step": 1782, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:46.708425", "step": 1782, "epoch": 3 }, { "type": "loss", "content": 1.752012576616835e-05, "timestamp": "2025-09-15 03:17:46.710685", "step": 1783, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:46.740955", "step": 1783, "epoch": 3 }, { "type": "loss", "content": 0.00017398015188518912, "timestamp": "2025-09-15 03:17:46.765667", "step": 1784, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:46.796491", "step": 1784, "epoch": 3 }, { "type": "loss", "content": 0.00010306698823114857, "timestamp": "2025-09-15 03:17:46.800677", "step": 1785, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:46.831725", "step": 1785, "epoch": 3 }, { "type": "loss", "content": 6.714624032611027e-05, "timestamp": "2025-09-15 03:17:46.835119", "step": 1786, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:46.866345", "step": 1786, "epoch": 3 }, { "type": "loss", "content": 0.0005150840734131634, "timestamp": "2025-09-15 03:17:46.872547", "step": 1787, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:46.905436", "step": 1787, "epoch": 3 }, { "type": "loss", "content": 0.0004283357411623001, "timestamp": "2025-09-15 03:17:46.929174", "step": 1788, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:46.960322", "step": 1788, "epoch": 3 }, { "type": "loss", "content": 0.0003343620337545872, "timestamp": "2025-09-15 03:17:46.962422", "step": 1789, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:46.999647", "step": 1789, "epoch": 3 }, { "type": "loss", "content": 0.000761338509619236, "timestamp": "2025-09-15 03:17:47.001884", "step": 1790, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:47.037223", "step": 1790, "epoch": 3 }, { "type": "loss", "content": 8.442411490250379e-05, "timestamp": "2025-09-15 03:17:47.039198", "step": 1791, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:47.069365", "step": 1791, "epoch": 3 }, { "type": "loss", "content": 0.0005162957822903991, "timestamp": "2025-09-15 03:17:47.094420", "step": 1792, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:47.126246", "step": 1792, "epoch": 3 }, { "type": "loss", "content": 0.000750789069570601, "timestamp": "2025-09-15 03:17:47.130381", "step": 1793, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:47.161590", "step": 1793, "epoch": 3 }, { "type": "loss", "content": 0.0051539321430027485, "timestamp": "2025-09-15 03:17:47.163674", "step": 1794, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:17:47.747823", "step": 1794, "epoch": 3 }, { "type": "pplx", "content": 139876290.7306363, "timestamp": "2025-09-15 03:17:47.750267", "step": 1794, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:47.780000", "step": 1794, "epoch": 3 }, { "type": "loss", "content": 5.731660712626763e-05, "timestamp": "2025-09-15 03:17:47.783328", "step": 1795, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:47.814510", "step": 1795, "epoch": 3 }, { "type": "loss", "content": 6.324555579340085e-05, "timestamp": "2025-09-15 03:17:47.838194", "step": 1796, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:47.868953", "step": 1796, "epoch": 3 }, { "type": "loss", "content": 0.00020690243400167674, "timestamp": "2025-09-15 03:17:47.870933", "step": 1797, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:47.901019", "step": 1797, "epoch": 3 }, { "type": "loss", "content": 8.057226659730077e-05, "timestamp": "2025-09-15 03:17:47.907824", "step": 1798, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:47.937934", "step": 1798, "epoch": 3 }, { "type": "loss", "content": 5.081558629171923e-05, "timestamp": "2025-09-15 03:17:47.940073", "step": 1799, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:47.970420", "step": 1799, "epoch": 3 }, { "type": "loss", "content": 0.000641024496871978, "timestamp": "2025-09-15 03:17:47.995209", "step": 1800, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:48.025733", "step": 1800, "epoch": 3 }, { "type": "loss", "content": 7.569055742351338e-05, "timestamp": "2025-09-15 03:17:48.027799", "step": 1801, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:48.058529", "step": 1801, "epoch": 3 }, { "type": "loss", "content": 3.237251439713873e-05, "timestamp": "2025-09-15 03:17:48.062471", "step": 1802, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:48.092147", "step": 1802, "epoch": 3 }, { "type": "loss", "content": 0.0009622335783205926, "timestamp": "2025-09-15 03:17:48.096558", "step": 1803, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:48.127561", "step": 1803, "epoch": 3 }, { "type": "loss", "content": 0.00022668966266792268, "timestamp": "2025-09-15 03:17:48.152553", "step": 1804, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:48.183191", "step": 1804, "epoch": 3 }, { "type": "loss", "content": 0.0006432764348573983, "timestamp": "2025-09-15 03:17:48.185065", "step": 1805, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:48.217108", "step": 1805, "epoch": 3 }, { "type": "loss", "content": 0.0002396961353952065, "timestamp": "2025-09-15 03:17:48.220989", "step": 1806, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:48.252007", "step": 1806, "epoch": 3 }, { "type": "loss", "content": 2.7812382541014813e-05, "timestamp": "2025-09-15 03:17:48.258314", "step": 1807, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:48.288829", "step": 1807, "epoch": 3 }, { "type": "loss", "content": 0.028440991416573524, "timestamp": "2025-09-15 03:17:48.316365", "step": 1808, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:48.347006", "step": 1808, "epoch": 3 }, { "type": "loss", "content": 5.9055400924989954e-05, "timestamp": "2025-09-15 03:17:48.349409", "step": 1809, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:48.379531", "step": 1809, "epoch": 3 }, { "type": "loss", "content": 6.185309030115604e-05, "timestamp": "2025-09-15 03:17:48.383699", "step": 1810, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:48.414706", "step": 1810, "epoch": 3 }, { "type": "loss", "content": 0.00011187683412572369, "timestamp": "2025-09-15 03:17:48.421412", "step": 1811, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:48.452106", "step": 1811, "epoch": 3 }, { "type": "loss", "content": 0.002612639917060733, "timestamp": "2025-09-15 03:17:48.475812", "step": 1812, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:48.506139", "step": 1812, "epoch": 3 }, { "type": "loss", "content": 0.00084578653331846, "timestamp": "2025-09-15 03:17:48.508136", "step": 1813, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:48.538928", "step": 1813, "epoch": 3 }, { "type": "loss", "content": 0.00016805656196083874, "timestamp": "2025-09-15 03:17:48.545362", "step": 1814, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:48.576927", "step": 1814, "epoch": 3 }, { "type": "loss", "content": 0.0002695379371289164, "timestamp": "2025-09-15 03:17:48.579339", "step": 1815, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:48.609711", "step": 1815, "epoch": 3 }, { "type": "loss", "content": 0.0001330919039901346, "timestamp": "2025-09-15 03:17:48.633494", "step": 1816, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:48.664532", "step": 1816, "epoch": 3 }, { "type": "loss", "content": 3.694388215080835e-05, "timestamp": "2025-09-15 03:17:48.668735", "step": 1817, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:48.699747", "step": 1817, "epoch": 3 }, { "type": "loss", "content": 8.705775690032169e-05, "timestamp": "2025-09-15 03:17:48.703552", "step": 1818, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:48.734292", "step": 1818, "epoch": 3 }, { "type": "loss", "content": 3.831345748039894e-05, "timestamp": "2025-09-15 03:17:48.736500", "step": 1819, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:48.768337", "step": 1819, "epoch": 3 }, { "type": "loss", "content": 6.43322491669096e-05, "timestamp": "2025-09-15 03:17:48.795922", "step": 1820, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:48.827213", "step": 1820, "epoch": 3 }, { "type": "loss", "content": 0.00010557322093518451, "timestamp": "2025-09-15 03:17:48.831882", "step": 1821, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:48.862576", "step": 1821, "epoch": 3 }, { "type": "loss", "content": 3.788010144489817e-05, "timestamp": "2025-09-15 03:17:48.865094", "step": 1822, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:48.895413", "step": 1822, "epoch": 3 }, { "type": "loss", "content": 6.223905802471563e-05, "timestamp": "2025-09-15 03:17:48.898615", "step": 1823, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:48.929419", "step": 1823, "epoch": 3 }, { "type": "loss", "content": 0.0003868688945658505, "timestamp": "2025-09-15 03:17:48.954107", "step": 1824, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:48.985096", "step": 1824, "epoch": 3 }, { "type": "loss", "content": 0.00013783354370389134, "timestamp": "2025-09-15 03:17:48.989397", "step": 1825, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:49.020157", "step": 1825, "epoch": 3 }, { "type": "loss", "content": 2.5953302611014806e-05, "timestamp": "2025-09-15 03:17:49.023940", "step": 1826, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:49.054521", "step": 1826, "epoch": 3 }, { "type": "loss", "content": 3.0394159693969414e-05, "timestamp": "2025-09-15 03:17:49.058746", "step": 1827, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 240 ], "flops": 7119318626112 }, "timestamp": "2025-09-15 03:17:49.089242", "step": 1827, "epoch": 3 }, { "type": "loss", "content": 7.785171328578144e-05, "timestamp": "2025-09-15 03:17:49.117503", "step": 1828, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:49.148405", "step": 1828, "epoch": 3 }, { "type": "loss", "content": 0.0018183441134169698, "timestamp": "2025-09-15 03:17:49.150989", "step": 1829, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:49.182348", "step": 1829, "epoch": 3 }, { "type": "loss", "content": 6.903747998876497e-05, "timestamp": "2025-09-15 03:17:49.186236", "step": 1830, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:49.216811", "step": 1830, "epoch": 3 }, { "type": "loss", "content": 0.00015899473510216922, "timestamp": "2025-09-15 03:17:49.218876", "step": 1831, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:49.249546", "step": 1831, "epoch": 3 }, { "type": "loss", "content": 0.001722590415738523, "timestamp": "2025-09-15 03:17:49.273439", "step": 1832, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:49.303242", "step": 1832, "epoch": 3 }, { "type": "loss", "content": 8.198097202694044e-05, "timestamp": "2025-09-15 03:17:49.305692", "step": 1833, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:17:49.889493", "step": 1833, "epoch": 3 }, { "type": "pplx", "content": 145811986.31162205, "timestamp": "2025-09-15 03:17:49.891391", "step": 1833, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:49.920674", "step": 1833, "epoch": 3 }, { "type": "loss", "content": 3.944264608435333e-05, "timestamp": "2025-09-15 03:17:49.923196", "step": 1834, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:49.953753", "step": 1834, "epoch": 3 }, { "type": "loss", "content": 0.0001038339250953868, "timestamp": "2025-09-15 03:17:49.957586", "step": 1835, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:49.987788", "step": 1835, "epoch": 3 }, { "type": "loss", "content": 3.5049022699240595e-05, "timestamp": "2025-09-15 03:17:50.011463", "step": 1836, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:50.042853", "step": 1836, "epoch": 3 }, { "type": "loss", "content": 4.700681893154979e-05, "timestamp": "2025-09-15 03:17:50.045000", "step": 1837, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:50.076048", "step": 1837, "epoch": 3 }, { "type": "loss", "content": 3.058017682633363e-05, "timestamp": "2025-09-15 03:17:50.078544", "step": 1838, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:50.109159", "step": 1838, "epoch": 3 }, { "type": "loss", "content": 2.1085737898829393e-05, "timestamp": "2025-09-15 03:17:50.112888", "step": 1839, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:50.143781", "step": 1839, "epoch": 3 }, { "type": "loss", "content": 9.610828419681638e-05, "timestamp": "2025-09-15 03:17:50.167469", "step": 1840, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:50.197108", "step": 1840, "epoch": 3 }, { "type": "loss", "content": 0.004703332204371691, "timestamp": "2025-09-15 03:17:50.199418", "step": 1841, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:50.229821", "step": 1841, "epoch": 3 }, { "type": "loss", "content": 0.0007846274529583752, "timestamp": "2025-09-15 03:17:50.232116", "step": 1842, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:50.262965", "step": 1842, "epoch": 3 }, { "type": "loss", "content": 9.324204438598827e-05, "timestamp": "2025-09-15 03:17:50.264999", "step": 1843, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:50.294921", "step": 1843, "epoch": 3 }, { "type": "loss", "content": 2.0824878447456285e-05, "timestamp": "2025-09-15 03:17:50.318448", "step": 1844, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:50.351452", "step": 1844, "epoch": 3 }, { "type": "loss", "content": 2.5175850169034675e-05, "timestamp": "2025-09-15 03:17:50.353872", "step": 1845, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 256 ], "flops": 7593922352128 }, "timestamp": "2025-09-15 03:17:50.385865", "step": 1845, "epoch": 3 }, { "type": "loss", "content": 2.5445378923905082e-05, "timestamp": "2025-09-15 03:17:50.393352", "step": 1846, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:50.424171", "step": 1846, "epoch": 3 }, { "type": "loss", "content": 0.00018293499306309968, "timestamp": "2025-09-15 03:17:50.426359", "step": 1847, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:50.457724", "step": 1847, "epoch": 3 }, { "type": "loss", "content": 0.0002315417950740084, "timestamp": "2025-09-15 03:17:50.485210", "step": 1848, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:50.515903", "step": 1848, "epoch": 3 }, { "type": "loss", "content": 2.1575300706899725e-05, "timestamp": "2025-09-15 03:17:50.518880", "step": 1849, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:50.549547", "step": 1849, "epoch": 3 }, { "type": "loss", "content": 7.31429536244832e-05, "timestamp": "2025-09-15 03:17:50.553778", "step": 1850, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:50.584547", "step": 1850, "epoch": 3 }, { "type": "loss", "content": 0.00030331689049489796, "timestamp": "2025-09-15 03:17:50.586652", "step": 1851, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:50.617072", "step": 1851, "epoch": 3 }, { "type": "loss", "content": 0.0001617541565792635, "timestamp": "2025-09-15 03:17:50.641767", "step": 1852, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:17:50.671890", "step": 1852, "epoch": 3 }, { "type": "loss", "content": 0.007455338723957539, "timestamp": "2025-09-15 03:17:50.673950", "step": 1853, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:50.704358", "step": 1853, "epoch": 3 }, { "type": "loss", "content": 0.00015413833898492157, "timestamp": "2025-09-15 03:17:50.706491", "step": 1854, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:50.737855", "step": 1854, "epoch": 3 }, { "type": "loss", "content": 8.115199307212606e-05, "timestamp": "2025-09-15 03:17:50.742300", "step": 1855, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:17:50.772640", "step": 1855, "epoch": 3 }, { "type": "loss", "content": 7.526958506787196e-05, "timestamp": "2025-09-15 03:17:50.796207", "step": 1856, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:50.828150", "step": 1856, "epoch": 3 }, { "type": "loss", "content": 0.00012373061326798052, "timestamp": "2025-09-15 03:17:50.833114", "step": 1857, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:50.865702", "step": 1857, "epoch": 3 }, { "type": "loss", "content": 9.806371963350102e-05, "timestamp": "2025-09-15 03:17:50.872254", "step": 1858, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:50.904947", "step": 1858, "epoch": 3 }, { "type": "loss", "content": 2.61258701357292e-05, "timestamp": "2025-09-15 03:17:50.911649", "step": 1859, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 96 ], "flops": 2847885091968 }, "timestamp": "2025-09-15 03:17:50.942527", "step": 1859, "epoch": 3 }, { "type": "loss", "content": 8.361357322428375e-05, "timestamp": "2025-09-15 03:17:50.966278", "step": 1860, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:50.997586", "step": 1860, "epoch": 3 }, { "type": "loss", "content": 0.00019799031724687666, "timestamp": "2025-09-15 03:17:51.002567", "step": 1861, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:17:51.035498", "step": 1861, "epoch": 3 }, { "type": "loss", "content": 0.00023388156841974705, "timestamp": "2025-09-15 03:17:51.040216", "step": 1862, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 224 ], "flops": 6644714900096 }, "timestamp": "2025-09-15 03:17:51.073505", "step": 1862, "epoch": 3 }, { "type": "loss", "content": 0.00021596389706246555, "timestamp": "2025-09-15 03:17:51.080234", "step": 1863, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:17:51.110559", "step": 1863, "epoch": 3 }, { "type": "loss", "content": 0.044805027544498444, "timestamp": "2025-09-15 03:17:51.134105", "step": 1864, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 192 ], "flops": 5695507448064 }, "timestamp": "2025-09-15 03:17:51.164511", "step": 1864, "epoch": 3 }, { "type": "loss", "content": 0.05255137011408806, "timestamp": "2025-09-15 03:17:51.166617", "step": 1865, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:51.197226", "step": 1865, "epoch": 3 }, { "type": "loss", "content": 6.220581417437643e-05, "timestamp": "2025-09-15 03:17:51.201180", "step": 1866, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:17:51.231987", "step": 1866, "epoch": 3 }, { "type": "loss", "content": 0.0002913509670179337, "timestamp": "2025-09-15 03:17:51.235894", "step": 1867, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 208 ], "flops": 6170111174080 }, "timestamp": "2025-09-15 03:17:51.266330", "step": 1867, "epoch": 3 }, { "type": "loss", "content": 0.00011638918658718467, "timestamp": "2025-09-15 03:17:51.293716", "step": 1868, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 2, 192 ], "flops": 2847885110400 }, "timestamp": "2025-09-15 03:17:51.324458", "step": 1868, "epoch": 3 }, { "type": "loss", "content": 3.343486241647042e-05, "timestamp": "2025-09-15 03:17:51.327696", "step": 1869, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 224 ], "batch_size": 8, "flops": 4429610391296 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 64 ], "batch_size": 8, "flops": 1265603017216 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 160 ], "batch_size": 8, "flops": 3164007441664 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 8, 208 ], "batch_size": 8, "flops": 4113209653888 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 176 ], "batch_size": 8, "flops": 3480408179072 }, { "type": "perplexity", "in_batch_dim": [ 8, 256 ], "batch_size": 8, "flops": 5062411866112 }, { "type": "perplexity", "in_batch_dim": [ 8, 128 ], "batch_size": 8, "flops": 2531205966848 }, { "type": "perplexity", "in_batch_dim": [ 8, 144 ], "batch_size": 8, "flops": 2847606704256 }, { "type": "perplexity", "in_batch_dim": [ 8, 192 ], "batch_size": 8, "flops": 3796808916480 }, { "type": "perplexity", "in_batch_dim": [ 5, 144 ], "batch_size": 8, "flops": 2847606704256 } ], "timestamp": "2025-09-15 03:17:51.914734", "step": 1869, "epoch": 3 }, { "type": "pplx", "content": 146250701.10244825, "timestamp": "2025-09-15 03:17:51.916520", "step": 1869, "epoch": 3 }, { "type": "best_pplx", "content": 52920853.06323647, "timestamp": "2025-09-15 03:17:51.917998", "step": 1869, "epoch": 3 }, { "type": "best_step", "content": 39, "timestamp": "2025-09-15 03:17:51.919338", "step": 1869, "epoch": 3 }, { "type": "total_pplx_flops", "content": 5333250945655808, "timestamp": "2025-09-15 03:17:51.920707", "step": 1869, "epoch": 3 }, { "type": "total_train_flops", "content": 9219668431260864, "timestamp": "2025-09-15 03:17:51.922367", "step": 1869, "epoch": 3 } ], "best_evals": { "pplx": { "score": 52920853.06323647, "step": 39 }, "rougel": { "precision": 0.8339350180505415, "recall": 0.8339350180505415, "fmeasure": 0.8339350180505415 } } }