{ "training_args": { "output_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters/nlg_rocstories_title_answer_generation_lora_v1", "overwrite_output_dir": false, "do_train": false, "do_eval": true, "do_predict": false, "eval_strategy": "steps", "prediction_loss_only": false, "per_device_train_batch_size": 4, "per_device_eval_batch_size": 8, "per_gpu_train_batch_size": null, "per_gpu_eval_batch_size": null, "gradient_accumulation_steps": 4, "eval_accumulation_steps": null, "eval_delay": 0, "torch_empty_cache_steps": null, "learning_rate": 2e-05, "weight_decay": 0.0, "adam_beta1": 0.9, "adam_beta2": 0.999, "adam_epsilon": 1e-08, "max_grad_norm": 1.0, "num_train_epochs": 3, "max_steps": -1, "lr_scheduler_type": "linear", "lr_scheduler_kwargs": {}, "warmup_ratio": 0.0, "warmup_steps": 0, "log_level": "passive", "log_level_replica": "warning", "log_on_each_node": true, "logging_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters/nlg_rocstories_title_answer_generation_lora_v1/runs/Sep02_14-10-49_gx12", "logging_strategy": "steps", "logging_first_step": false, "logging_steps": 20, "logging_nan_inf_filter": true, "save_strategy": "epoch", "save_steps": 40, "save_total_limit": null, "save_safetensors": true, "save_on_each_node": false, "save_only_model": false, "restore_callback_states_from_checkpoint": false, "no_cuda": false, "use_cpu": false, "use_mps_device": false, "seed": 42, "data_seed": null, "jit_mode_eval": false, "use_ipex": false, "bf16": false, "fp16": false, "fp16_opt_level": "O1", "half_precision_backend": "auto", "bf16_full_eval": false, "fp16_full_eval": false, "tf32": null, "local_rank": 0, "ddp_backend": null, "tpu_num_cores": null, "tpu_metrics_debug": false, "debug": [], "dataloader_drop_last": false, "eval_steps": 20, "dataloader_num_workers": 0, "dataloader_prefetch_factor": null, "past_index": -1, "run_name": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters/nlg_rocstories_title_answer_generation_lora_v1", "disable_tqdm": false, "remove_unused_columns": true, "label_names": null, "load_best_model_at_end": false, "metric_for_best_model": null, "greater_is_better": null, "ignore_data_skip": false, "fsdp": [], "fsdp_min_num_params": 0, "fsdp_config": { "min_num_params": 0, "xla": false, "xla_fsdp_v2": false, "xla_fsdp_grad_ckpt": false }, "fsdp_transformer_layer_cls_to_wrap": null, "accelerator_config": { "split_batches": false, "dispatch_batches": null, "even_batches": true, "use_seedable_sampler": true, "non_blocking": false, "gradient_accumulation_kwargs": null }, "deepspeed": null, "label_smoothing_factor": 0.0, "optim": "adamw_torch", "optim_args": null, "adafactor": false, "group_by_length": false, "length_column_name": "length", "report_to": [], "ddp_find_unused_parameters": null, "ddp_bucket_cap_mb": null, "ddp_broadcast_buffers": null, "dataloader_pin_memory": true, "dataloader_persistent_workers": false, "skip_memory_metrics": true, "use_legacy_prediction_loop": false, "push_to_hub": false, "resume_from_checkpoint": null, "hub_model_id": null, "hub_strategy": "every_save", "hub_token": "", "hub_private_repo": null, "hub_always_push": false, "gradient_checkpointing": false, "gradient_checkpointing_kwargs": null, "include_inputs_for_metrics": false, "include_for_metrics": [], "eval_do_concat_batches": true, "fp16_backend": "auto", "push_to_hub_model_id": null, "push_to_hub_organization": null, "push_to_hub_token": "", "mp_parameters": "", "auto_find_batch_size": false, "full_determinism": false, "torchdynamo": null, "ray_scope": "last", "ddp_timeout": 1800, "torch_compile": false, "torch_compile_backend": null, "torch_compile_mode": null, "include_tokens_per_second": false, "include_num_input_tokens_seen": false, "neftune_noise_alpha": null, "optim_target_modules": null, "batch_eval_metrics": false, "eval_on_start": false, "use_liger_kernel": false, "eval_use_gather_object": false, "average_tokens_across_devices": false }, "lora_config": { "task_type": "CAUSAL_LM", "peft_type": "LORA", "auto_mapping": null, "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", "revision": null, "inference_mode": false, "r": 16, "target_modules": [ "o_proj", "gate_proj", "down_proj", "up_proj", "v_proj", "q_proj", "k_proj" ], "exclude_modules": null, "lora_alpha": 16, "lora_dropout": 0.1, "fan_in_fan_out": false, "bias": "none", "use_rslora": true, "modules_to_save": null, "init_lora_weights": true, "layers_to_transform": null, "layers_pattern": null, "rank_pattern": {}, "alpha_pattern": {}, "megatron_config": null, "megatron_core": "megatron.core", "trainable_token_indices": null, "loftq_config": {}, "eva_config": null, "corda_config": null, "use_dora": false, "layer_replication": null, "runtime_config": { "ephemeral_gpu_offload": false }, "lora_bias": false }, "flops": { "eval": 31522133944471680, "train": 1.00080610181424e+16, "total": 4.153019496261408e+16 }, "total_energy": 13.80715, "logs": [ { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:10:58.230226", "step": 0, "epoch": 0 }, { "type": "pplx", "content": 15377.383313891298, "timestamp": "2025-09-02 14:10:58.232736", "step": 0, "epoch": 0 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:10:58.334062", "step": 0, "epoch": 1 }, { "type": "loss", "content": 0.6810653805732727, "timestamp": "2025-09-02 14:10:58.336651", "step": 1, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:10:58.407099", "step": 1, "epoch": 1 }, { "type": "loss", "content": 0.6935662627220154, "timestamp": "2025-09-02 14:10:58.409495", "step": 2, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:10:58.463463", "step": 2, "epoch": 1 }, { "type": "loss", "content": 0.6932523250579834, "timestamp": "2025-09-02 14:10:58.465845", "step": 3, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:10:58.527248", "step": 3, "epoch": 1 }, { "type": "loss", "content": 0.7479343414306641, "timestamp": "2025-09-02 14:10:58.573475", "step": 4, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:10:58.628575", "step": 4, "epoch": 1 }, { "type": "loss", "content": 0.41945281624794006, "timestamp": "2025-09-02 14:10:58.630871", "step": 5, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:10:58.691116", "step": 5, "epoch": 1 }, { "type": "loss", "content": 0.5741308331489563, "timestamp": "2025-09-02 14:10:58.694510", "step": 6, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:10:58.746626", "step": 6, "epoch": 1 }, { "type": "loss", "content": 0.6775122284889221, "timestamp": "2025-09-02 14:10:58.749120", "step": 7, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:10:58.801492", "step": 7, "epoch": 1 }, { "type": "loss", "content": 0.46684426069259644, "timestamp": "2025-09-02 14:10:58.807578", "step": 8, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:10:58.859194", "step": 8, "epoch": 1 }, { "type": "loss", "content": 0.45560818910598755, "timestamp": "2025-09-02 14:10:58.861221", "step": 9, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:10:58.930915", "step": 9, "epoch": 1 }, { "type": "loss", "content": 0.5112396478652954, "timestamp": "2025-09-02 14:10:58.933016", "step": 10, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:10:58.985657", "step": 10, "epoch": 1 }, { "type": "loss", "content": 0.5605266094207764, "timestamp": "2025-09-02 14:10:58.989073", "step": 11, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:10:59.043038", "step": 11, "epoch": 1 }, { "type": "loss", "content": 0.3811867833137512, "timestamp": "2025-09-02 14:10:59.049216", "step": 12, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:10:59.101036", "step": 12, "epoch": 1 }, { "type": "loss", "content": 0.4144041836261749, "timestamp": "2025-09-02 14:10:59.103221", "step": 13, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:10:59.155675", "step": 13, "epoch": 1 }, { "type": "loss", "content": 0.43283385038375854, "timestamp": "2025-09-02 14:10:59.157929", "step": 14, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:10:59.210478", "step": 14, "epoch": 1 }, { "type": "loss", "content": 0.4814932644367218, "timestamp": "2025-09-02 14:10:59.212575", "step": 15, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:10:59.264874", "step": 15, "epoch": 1 }, { "type": "loss", "content": 0.4341704547405243, "timestamp": "2025-09-02 14:10:59.271056", "step": 16, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:10:59.323522", "step": 16, "epoch": 1 }, { "type": "loss", "content": 0.2765498459339142, "timestamp": "2025-09-02 14:10:59.325952", "step": 17, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:10:59.378902", "step": 17, "epoch": 1 }, { "type": "loss", "content": 0.24207574129104614, "timestamp": "2025-09-02 14:10:59.381183", "step": 18, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:10:59.434089", "step": 18, "epoch": 1 }, { "type": "loss", "content": 0.5494191646575928, "timestamp": "2025-09-02 14:10:59.436186", "step": 19, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:10:59.488569", "step": 19, "epoch": 1 }, { "type": "loss", "content": 0.38677507638931274, "timestamp": "2025-09-02 14:10:59.494397", "step": 20, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:11:01.342231", "step": 20, "epoch": 1 }, { "type": "pplx", "content": 13503.96050699745, "timestamp": "2025-09-02 14:11:01.345348", "step": 20, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:01.398123", "step": 20, "epoch": 1 }, { "type": "loss", "content": 0.3622997999191284, "timestamp": "2025-09-02 14:11:01.400248", "step": 21, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:01.453231", "step": 21, "epoch": 1 }, { "type": "loss", "content": 0.3333297073841095, "timestamp": "2025-09-02 14:11:01.455425", "step": 22, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:01.508032", "step": 22, "epoch": 1 }, { "type": "loss", "content": 0.3802996277809143, "timestamp": "2025-09-02 14:11:01.510328", "step": 23, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:01.563809", "step": 23, "epoch": 1 }, { "type": "loss", "content": 0.3215607702732086, "timestamp": "2025-09-02 14:11:01.569935", "step": 24, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:01.622280", "step": 24, "epoch": 1 }, { "type": "loss", "content": 0.2976091802120209, "timestamp": "2025-09-02 14:11:01.624597", "step": 25, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:01.679180", "step": 25, "epoch": 1 }, { "type": "loss", "content": 0.2989455461502075, "timestamp": "2025-09-02 14:11:01.681456", "step": 26, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:01.734285", "step": 26, "epoch": 1 }, { "type": "loss", "content": 0.36612507700920105, "timestamp": "2025-09-02 14:11:01.736490", "step": 27, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:01.789394", "step": 27, "epoch": 1 }, { "type": "loss", "content": 0.3733898997306824, "timestamp": "2025-09-02 14:11:01.795162", "step": 28, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:01.847033", "step": 28, "epoch": 1 }, { "type": "loss", "content": 0.25685375928878784, "timestamp": "2025-09-02 14:11:01.849180", "step": 29, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:01.902220", "step": 29, "epoch": 1 }, { "type": "loss", "content": 0.3431992828845978, "timestamp": "2025-09-02 14:11:01.904532", "step": 30, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:01.958387", "step": 30, "epoch": 1 }, { "type": "loss", "content": 0.26993322372436523, "timestamp": "2025-09-02 14:11:01.960492", "step": 31, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:02.012819", "step": 31, "epoch": 1 }, { "type": "loss", "content": 0.27196741104125977, "timestamp": "2025-09-02 14:11:02.018691", "step": 32, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:02.076931", "step": 32, "epoch": 1 }, { "type": "loss", "content": 0.21510900557041168, "timestamp": "2025-09-02 14:11:02.079253", "step": 33, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:02.132246", "step": 33, "epoch": 1 }, { "type": "loss", "content": 0.4839702248573303, "timestamp": "2025-09-02 14:11:02.134574", "step": 34, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:02.187331", "step": 34, "epoch": 1 }, { "type": "loss", "content": 0.4010538160800934, "timestamp": "2025-09-02 14:11:02.189961", "step": 35, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:02.242576", "step": 35, "epoch": 1 }, { "type": "loss", "content": 0.38624754548072815, "timestamp": "2025-09-02 14:11:02.252738", "step": 36, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:02.308186", "step": 36, "epoch": 1 }, { "type": "loss", "content": 0.2517738938331604, "timestamp": "2025-09-02 14:11:02.310350", "step": 37, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:02.363274", "step": 37, "epoch": 1 }, { "type": "loss", "content": 0.39301809668540955, "timestamp": "2025-09-02 14:11:02.365448", "step": 38, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:02.418044", "step": 38, "epoch": 1 }, { "type": "loss", "content": 0.3391234576702118, "timestamp": "2025-09-02 14:11:02.420146", "step": 39, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:02.472465", "step": 39, "epoch": 1 }, { "type": "loss", "content": 0.1925923377275467, "timestamp": "2025-09-02 14:11:02.478037", "step": 40, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:11:04.339927", "step": 40, "epoch": 1 }, { "type": "pplx", "content": 11757.3902444456, "timestamp": "2025-09-02 14:11:04.341950", "step": 40, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 40", "timestamp": "2025-09-02 14:11:04.755368", "step": 40, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:04.811670", "step": 40, "epoch": 1 }, { "type": "loss", "content": 0.35957810282707214, "timestamp": "2025-09-02 14:11:04.813809", "step": 41, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:04.867290", "step": 41, "epoch": 1 }, { "type": "loss", "content": 0.2525498867034912, "timestamp": "2025-09-02 14:11:04.869448", "step": 42, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:04.922242", "step": 42, "epoch": 1 }, { "type": "loss", "content": 0.31312912702560425, "timestamp": "2025-09-02 14:11:04.924441", "step": 43, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:04.979087", "step": 43, "epoch": 1 }, { "type": "loss", "content": 0.21224404871463776, "timestamp": "2025-09-02 14:11:04.985165", "step": 44, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:05.037171", "step": 44, "epoch": 1 }, { "type": "loss", "content": 0.2654156982898712, "timestamp": "2025-09-02 14:11:05.039524", "step": 45, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:05.092619", "step": 45, "epoch": 1 }, { "type": "loss", "content": 0.3601357340812683, "timestamp": "2025-09-02 14:11:05.094872", "step": 46, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:05.148087", "step": 46, "epoch": 1 }, { "type": "loss", "content": 0.286220520734787, "timestamp": "2025-09-02 14:11:05.150487", "step": 47, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:05.202820", "step": 47, "epoch": 1 }, { "type": "loss", "content": 0.18966296315193176, "timestamp": "2025-09-02 14:11:05.208694", "step": 48, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:05.260866", "step": 48, "epoch": 1 }, { "type": "loss", "content": 0.23035761713981628, "timestamp": "2025-09-02 14:11:05.263160", "step": 49, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:05.316370", "step": 49, "epoch": 1 }, { "type": "loss", "content": 0.5127808451652527, "timestamp": "2025-09-02 14:11:05.318616", "step": 50, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:05.371542", "step": 50, "epoch": 1 }, { "type": "loss", "content": 0.38094741106033325, "timestamp": "2025-09-02 14:11:05.373815", "step": 51, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:05.427905", "step": 51, "epoch": 1 }, { "type": "loss", "content": 0.30248627066612244, "timestamp": "2025-09-02 14:11:05.434125", "step": 52, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:05.486777", "step": 52, "epoch": 1 }, { "type": "loss", "content": 0.5653179287910461, "timestamp": "2025-09-02 14:11:05.489000", "step": 53, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:05.541324", "step": 53, "epoch": 1 }, { "type": "loss", "content": 0.3499302864074707, "timestamp": "2025-09-02 14:11:05.543609", "step": 54, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:05.596528", "step": 54, "epoch": 1 }, { "type": "loss", "content": 0.2980552017688751, "timestamp": "2025-09-02 14:11:05.598799", "step": 55, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:05.651405", "step": 55, "epoch": 1 }, { "type": "loss", "content": 0.2650551199913025, "timestamp": "2025-09-02 14:11:05.657695", "step": 56, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:05.709766", "step": 56, "epoch": 1 }, { "type": "loss", "content": 0.37913110852241516, "timestamp": "2025-09-02 14:11:05.711876", "step": 57, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:05.765069", "step": 57, "epoch": 1 }, { "type": "loss", "content": 0.20260180532932281, "timestamp": "2025-09-02 14:11:05.767296", "step": 58, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:05.819987", "step": 58, "epoch": 1 }, { "type": "loss", "content": 0.33883360028266907, "timestamp": "2025-09-02 14:11:05.822156", "step": 59, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:05.875269", "step": 59, "epoch": 1 }, { "type": "loss", "content": 0.24840442836284637, "timestamp": "2025-09-02 14:11:05.880796", "step": 60, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:11:07.810080", "step": 60, "epoch": 1 }, { "type": "pplx", "content": 10436.212008056213, "timestamp": "2025-09-02 14:11:07.812098", "step": 60, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:07.865193", "step": 60, "epoch": 1 }, { "type": "loss", "content": 0.06612782180309296, "timestamp": "2025-09-02 14:11:07.867399", "step": 61, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:07.921240", "step": 61, "epoch": 1 }, { "type": "loss", "content": 0.3780595362186432, "timestamp": "2025-09-02 14:11:07.923500", "step": 62, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:07.976883", "step": 62, "epoch": 1 }, { "type": "loss", "content": 0.2025892734527588, "timestamp": "2025-09-02 14:11:07.979032", "step": 63, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:08.032443", "step": 63, "epoch": 1 }, { "type": "loss", "content": 0.1940346360206604, "timestamp": "2025-09-02 14:11:08.038241", "step": 64, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:08.090089", "step": 64, "epoch": 1 }, { "type": "loss", "content": 0.223342627286911, "timestamp": "2025-09-02 14:11:08.092256", "step": 65, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:08.144850", "step": 65, "epoch": 1 }, { "type": "loss", "content": 0.41689103841781616, "timestamp": "2025-09-02 14:11:08.147256", "step": 66, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:08.200139", "step": 66, "epoch": 1 }, { "type": "loss", "content": 0.30422917008399963, "timestamp": "2025-09-02 14:11:08.202662", "step": 67, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:08.256421", "step": 67, "epoch": 1 }, { "type": "loss", "content": 0.15387417376041412, "timestamp": "2025-09-02 14:11:08.262585", "step": 68, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:08.314633", "step": 68, "epoch": 1 }, { "type": "loss", "content": 0.2716781497001648, "timestamp": "2025-09-02 14:11:08.317822", "step": 69, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:08.371096", "step": 69, "epoch": 1 }, { "type": "loss", "content": 0.16956454515457153, "timestamp": "2025-09-02 14:11:08.373358", "step": 70, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:08.427632", "step": 70, "epoch": 1 }, { "type": "loss", "content": 0.356372594833374, "timestamp": "2025-09-02 14:11:08.429799", "step": 71, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:08.484373", "step": 71, "epoch": 1 }, { "type": "loss", "content": 0.20348824560642242, "timestamp": "2025-09-02 14:11:08.492619", "step": 72, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:08.547094", "step": 72, "epoch": 1 }, { "type": "loss", "content": 0.2283041775226593, "timestamp": "2025-09-02 14:11:08.549125", "step": 73, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:08.601791", "step": 73, "epoch": 1 }, { "type": "loss", "content": 0.14195705950260162, "timestamp": "2025-09-02 14:11:08.603984", "step": 74, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:08.660254", "step": 74, "epoch": 1 }, { "type": "loss", "content": 0.20851697027683258, "timestamp": "2025-09-02 14:11:08.663004", "step": 75, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:08.716114", "step": 75, "epoch": 1 }, { "type": "loss", "content": 0.16509728133678436, "timestamp": "2025-09-02 14:11:08.721873", "step": 76, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:08.773661", "step": 76, "epoch": 1 }, { "type": "loss", "content": 0.22062787413597107, "timestamp": "2025-09-02 14:11:08.775954", "step": 77, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:08.828378", "step": 77, "epoch": 1 }, { "type": "loss", "content": 0.32037293910980225, "timestamp": "2025-09-02 14:11:08.830535", "step": 78, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:08.883490", "step": 78, "epoch": 1 }, { "type": "loss", "content": 0.38048428297042847, "timestamp": "2025-09-02 14:11:08.885807", "step": 79, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:08.938247", "step": 79, "epoch": 1 }, { "type": "loss", "content": 0.2635933458805084, "timestamp": "2025-09-02 14:11:08.944095", "step": 80, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:11:10.816727", "step": 80, "epoch": 1 }, { "type": "pplx", "content": 9534.48487514869, "timestamp": "2025-09-02 14:11:10.819247", "step": 80, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 80", "timestamp": "2025-09-02 14:11:11.183862", "step": 80, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:11.237520", "step": 80, "epoch": 1 }, { "type": "loss", "content": 0.2650878429412842, "timestamp": "2025-09-02 14:11:11.239924", "step": 81, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:11.294038", "step": 81, "epoch": 1 }, { "type": "loss", "content": 0.23925361037254333, "timestamp": "2025-09-02 14:11:11.296060", "step": 82, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:11.349836", "step": 82, "epoch": 1 }, { "type": "loss", "content": 0.17273873090744019, "timestamp": "2025-09-02 14:11:11.352140", "step": 83, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:11.407237", "step": 83, "epoch": 1 }, { "type": "loss", "content": 0.2792847454547882, "timestamp": "2025-09-02 14:11:11.413476", "step": 84, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:11.465396", "step": 84, "epoch": 1 }, { "type": "loss", "content": 0.20825059711933136, "timestamp": "2025-09-02 14:11:11.467754", "step": 85, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:11.520244", "step": 85, "epoch": 1 }, { "type": "loss", "content": 0.29281163215637207, "timestamp": "2025-09-02 14:11:11.522341", "step": 86, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:11.574937", "step": 86, "epoch": 1 }, { "type": "loss", "content": 0.23509106040000916, "timestamp": "2025-09-02 14:11:11.577086", "step": 87, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:11.629599", "step": 87, "epoch": 1 }, { "type": "loss", "content": 0.21337811648845673, "timestamp": "2025-09-02 14:11:11.635639", "step": 88, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:11.687714", "step": 88, "epoch": 1 }, { "type": "loss", "content": 0.22497226297855377, "timestamp": "2025-09-02 14:11:11.690006", "step": 89, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:11.742598", "step": 89, "epoch": 1 }, { "type": "loss", "content": 0.29963529109954834, "timestamp": "2025-09-02 14:11:11.744865", "step": 90, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:11.798865", "step": 90, "epoch": 1 }, { "type": "loss", "content": 0.1920796036720276, "timestamp": "2025-09-02 14:11:11.801110", "step": 91, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:11.853805", "step": 91, "epoch": 1 }, { "type": "loss", "content": 0.22429895401000977, "timestamp": "2025-09-02 14:11:11.859416", "step": 92, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:11.912550", "step": 92, "epoch": 1 }, { "type": "loss", "content": 0.1192128136754036, "timestamp": "2025-09-02 14:11:11.914692", "step": 93, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:11.967171", "step": 93, "epoch": 1 }, { "type": "loss", "content": 0.22166390717029572, "timestamp": "2025-09-02 14:11:11.969215", "step": 94, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:12.022111", "step": 94, "epoch": 1 }, { "type": "loss", "content": 0.2015092819929123, "timestamp": "2025-09-02 14:11:12.024136", "step": 95, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:12.076873", "step": 95, "epoch": 1 }, { "type": "loss", "content": 0.1826450526714325, "timestamp": "2025-09-02 14:11:12.082655", "step": 96, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:12.134920", "step": 96, "epoch": 1 }, { "type": "loss", "content": 0.23743358254432678, "timestamp": "2025-09-02 14:11:12.137217", "step": 97, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:12.190210", "step": 97, "epoch": 1 }, { "type": "loss", "content": 0.19413703680038452, "timestamp": "2025-09-02 14:11:12.192602", "step": 98, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:12.245217", "step": 98, "epoch": 1 }, { "type": "loss", "content": 0.21949802339076996, "timestamp": "2025-09-02 14:11:12.247436", "step": 99, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:12.299680", "step": 99, "epoch": 1 }, { "type": "loss", "content": 0.24238897860050201, "timestamp": "2025-09-02 14:11:12.305549", "step": 100, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:11:14.157306", "step": 100, "epoch": 1 }, { "type": "pplx", "content": 8997.121001628613, "timestamp": "2025-09-02 14:11:14.159346", "step": 100, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:14.211815", "step": 100, "epoch": 1 }, { "type": "loss", "content": 0.20528535544872284, "timestamp": "2025-09-02 14:11:14.213838", "step": 101, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:14.267155", "step": 101, "epoch": 1 }, { "type": "loss", "content": 0.2262796312570572, "timestamp": "2025-09-02 14:11:14.269384", "step": 102, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:14.321806", "step": 102, "epoch": 1 }, { "type": "loss", "content": 0.1589289903640747, "timestamp": "2025-09-02 14:11:14.324101", "step": 103, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:14.377115", "step": 103, "epoch": 1 }, { "type": "loss", "content": 0.09831925481557846, "timestamp": "2025-09-02 14:11:14.383274", "step": 104, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:14.435585", "step": 104, "epoch": 1 }, { "type": "loss", "content": 0.38821232318878174, "timestamp": "2025-09-02 14:11:14.437971", "step": 105, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:14.490088", "step": 105, "epoch": 1 }, { "type": "loss", "content": 0.19901733100414276, "timestamp": "2025-09-02 14:11:14.492320", "step": 106, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:14.546716", "step": 106, "epoch": 1 }, { "type": "loss", "content": 0.3725179433822632, "timestamp": "2025-09-02 14:11:14.549290", "step": 107, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:14.601769", "step": 107, "epoch": 1 }, { "type": "loss", "content": 0.15710106492042542, "timestamp": "2025-09-02 14:11:14.607749", "step": 108, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:14.659781", "step": 108, "epoch": 1 }, { "type": "loss", "content": 0.18399730324745178, "timestamp": "2025-09-02 14:11:14.662086", "step": 109, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:14.714614", "step": 109, "epoch": 1 }, { "type": "loss", "content": 0.244423970580101, "timestamp": "2025-09-02 14:11:14.717017", "step": 110, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:14.769715", "step": 110, "epoch": 1 }, { "type": "loss", "content": 0.2965731620788574, "timestamp": "2025-09-02 14:11:14.771908", "step": 111, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:14.824298", "step": 111, "epoch": 1 }, { "type": "loss", "content": 0.24987180531024933, "timestamp": "2025-09-02 14:11:14.830423", "step": 112, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:14.882372", "step": 112, "epoch": 1 }, { "type": "loss", "content": 0.27532151341438293, "timestamp": "2025-09-02 14:11:14.886026", "step": 113, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:14.938966", "step": 113, "epoch": 1 }, { "type": "loss", "content": 0.2653392255306244, "timestamp": "2025-09-02 14:11:14.941350", "step": 114, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:14.993684", "step": 114, "epoch": 1 }, { "type": "loss", "content": 0.21873672306537628, "timestamp": "2025-09-02 14:11:14.996023", "step": 115, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:15.048797", "step": 115, "epoch": 1 }, { "type": "loss", "content": 0.2590881288051605, "timestamp": "2025-09-02 14:11:15.054788", "step": 116, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:15.107080", "step": 116, "epoch": 1 }, { "type": "loss", "content": 0.3191036880016327, "timestamp": "2025-09-02 14:11:15.109269", "step": 117, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:15.161615", "step": 117, "epoch": 1 }, { "type": "loss", "content": 0.17450746893882751, "timestamp": "2025-09-02 14:11:15.163830", "step": 118, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:15.215863", "step": 118, "epoch": 1 }, { "type": "loss", "content": 0.2758004069328308, "timestamp": "2025-09-02 14:11:15.218193", "step": 119, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:15.270592", "step": 119, "epoch": 1 }, { "type": "loss", "content": 0.1487528532743454, "timestamp": "2025-09-02 14:11:15.276718", "step": 120, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:11:17.117736", "step": 120, "epoch": 1 }, { "type": "pplx", "content": 8711.237715327981, "timestamp": "2025-09-02 14:11:17.120133", "step": 120, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 120", "timestamp": "2025-09-02 14:11:17.477793", "step": 120, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:17.534259", "step": 120, "epoch": 1 }, { "type": "loss", "content": 0.1641220599412918, "timestamp": "2025-09-02 14:11:17.536397", "step": 121, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:17.589900", "step": 121, "epoch": 1 }, { "type": "loss", "content": 0.3280048072338104, "timestamp": "2025-09-02 14:11:17.594868", "step": 122, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:17.650417", "step": 122, "epoch": 1 }, { "type": "loss", "content": 0.13661973178386688, "timestamp": "2025-09-02 14:11:17.652266", "step": 123, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:17.704113", "step": 123, "epoch": 1 }, { "type": "loss", "content": 0.34156885743141174, "timestamp": "2025-09-02 14:11:17.710267", "step": 124, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:17.762271", "step": 124, "epoch": 1 }, { "type": "loss", "content": 0.29836374521255493, "timestamp": "2025-09-02 14:11:17.764528", "step": 125, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:17.816745", "step": 125, "epoch": 1 }, { "type": "loss", "content": 0.23304621875286102, "timestamp": "2025-09-02 14:11:17.819177", "step": 126, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:17.872913", "step": 126, "epoch": 1 }, { "type": "loss", "content": 0.25545743107795715, "timestamp": "2025-09-02 14:11:17.875364", "step": 127, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:17.927799", "step": 127, "epoch": 1 }, { "type": "loss", "content": 0.13710877299308777, "timestamp": "2025-09-02 14:11:17.933824", "step": 128, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:17.985715", "step": 128, "epoch": 1 }, { "type": "loss", "content": 0.3372194170951843, "timestamp": "2025-09-02 14:11:17.987876", "step": 129, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:18.041461", "step": 129, "epoch": 1 }, { "type": "loss", "content": 0.21443058550357819, "timestamp": "2025-09-02 14:11:18.043699", "step": 130, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:18.096687", "step": 130, "epoch": 1 }, { "type": "loss", "content": 0.14444293081760406, "timestamp": "2025-09-02 14:11:18.098860", "step": 131, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:18.151626", "step": 131, "epoch": 1 }, { "type": "loss", "content": 0.1380845159292221, "timestamp": "2025-09-02 14:11:18.157701", "step": 132, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:18.209226", "step": 132, "epoch": 1 }, { "type": "loss", "content": 0.27670273184776306, "timestamp": "2025-09-02 14:11:18.211318", "step": 133, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:18.263774", "step": 133, "epoch": 1 }, { "type": "loss", "content": 0.20387355983257294, "timestamp": "2025-09-02 14:11:18.265920", "step": 134, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:18.318593", "step": 134, "epoch": 1 }, { "type": "loss", "content": 0.17558962106704712, "timestamp": "2025-09-02 14:11:18.320730", "step": 135, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:18.372765", "step": 135, "epoch": 1 }, { "type": "loss", "content": 0.22743868827819824, "timestamp": "2025-09-02 14:11:18.378559", "step": 136, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:18.430622", "step": 136, "epoch": 1 }, { "type": "loss", "content": 0.27827030420303345, "timestamp": "2025-09-02 14:11:18.432693", "step": 137, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:18.485171", "step": 137, "epoch": 1 }, { "type": "loss", "content": 0.20491473376750946, "timestamp": "2025-09-02 14:11:18.487597", "step": 138, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:18.540031", "step": 138, "epoch": 1 }, { "type": "loss", "content": 0.3780348300933838, "timestamp": "2025-09-02 14:11:18.542201", "step": 139, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:18.595146", "step": 139, "epoch": 1 }, { "type": "loss", "content": 0.1543915867805481, "timestamp": "2025-09-02 14:11:18.600819", "step": 140, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:11:20.483659", "step": 140, "epoch": 1 }, { "type": "pplx", "content": 8641.082154800039, "timestamp": "2025-09-02 14:11:20.485585", "step": 140, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:20.537805", "step": 140, "epoch": 1 }, { "type": "loss", "content": 0.2231673151254654, "timestamp": "2025-09-02 14:11:20.540113", "step": 141, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:20.593193", "step": 141, "epoch": 1 }, { "type": "loss", "content": 0.2038366049528122, "timestamp": "2025-09-02 14:11:20.595490", "step": 142, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:20.648295", "step": 142, "epoch": 1 }, { "type": "loss", "content": 0.26577290892601013, "timestamp": "2025-09-02 14:11:20.650602", "step": 143, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:20.703532", "step": 143, "epoch": 1 }, { "type": "loss", "content": 0.21237285435199738, "timestamp": "2025-09-02 14:11:20.709963", "step": 144, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:20.762553", "step": 144, "epoch": 1 }, { "type": "loss", "content": 0.2300122231245041, "timestamp": "2025-09-02 14:11:20.765075", "step": 145, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:20.821888", "step": 145, "epoch": 1 }, { "type": "loss", "content": 0.26852089166641235, "timestamp": "2025-09-02 14:11:20.824329", "step": 146, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:20.877482", "step": 146, "epoch": 1 }, { "type": "loss", "content": 0.18954215943813324, "timestamp": "2025-09-02 14:11:20.880113", "step": 147, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:20.932671", "step": 147, "epoch": 1 }, { "type": "loss", "content": 0.3372737169265747, "timestamp": "2025-09-02 14:11:20.938562", "step": 148, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:20.990639", "step": 148, "epoch": 1 }, { "type": "loss", "content": 0.20788812637329102, "timestamp": "2025-09-02 14:11:20.992856", "step": 149, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:21.045057", "step": 149, "epoch": 1 }, { "type": "loss", "content": 0.12011725455522537, "timestamp": "2025-09-02 14:11:21.047113", "step": 150, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:21.099470", "step": 150, "epoch": 1 }, { "type": "loss", "content": 0.1590137779712677, "timestamp": "2025-09-02 14:11:21.101850", "step": 151, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:21.154694", "step": 151, "epoch": 1 }, { "type": "loss", "content": 0.17943251132965088, "timestamp": "2025-09-02 14:11:21.160855", "step": 152, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:21.212828", "step": 152, "epoch": 1 }, { "type": "loss", "content": 0.30411621928215027, "timestamp": "2025-09-02 14:11:21.214983", "step": 153, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:21.274129", "step": 153, "epoch": 1 }, { "type": "loss", "content": 0.28686875104904175, "timestamp": "2025-09-02 14:11:21.276539", "step": 154, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:21.328900", "step": 154, "epoch": 1 }, { "type": "loss", "content": 0.2284877598285675, "timestamp": "2025-09-02 14:11:21.331222", "step": 155, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:21.384192", "step": 155, "epoch": 1 }, { "type": "loss", "content": 0.1202070564031601, "timestamp": "2025-09-02 14:11:21.390052", "step": 156, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:21.442057", "step": 156, "epoch": 1 }, { "type": "loss", "content": 0.31093546748161316, "timestamp": "2025-09-02 14:11:21.444279", "step": 157, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:21.497065", "step": 157, "epoch": 1 }, { "type": "loss", "content": 0.22506728768348694, "timestamp": "2025-09-02 14:11:21.499243", "step": 158, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:21.552060", "step": 158, "epoch": 1 }, { "type": "loss", "content": 0.11343608796596527, "timestamp": "2025-09-02 14:11:21.554443", "step": 159, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:21.606913", "step": 159, "epoch": 1 }, { "type": "loss", "content": 0.19309087097644806, "timestamp": "2025-09-02 14:11:21.612808", "step": 160, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:11:23.484237", "step": 160, "epoch": 1 }, { "type": "pplx", "content": 8558.367261575591, "timestamp": "2025-09-02 14:11:23.486944", "step": 160, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 160", "timestamp": "2025-09-02 14:11:23.844462", "step": 160, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:23.900536", "step": 160, "epoch": 1 }, { "type": "loss", "content": 0.22142437100410461, "timestamp": "2025-09-02 14:11:23.902727", "step": 161, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:23.956604", "step": 161, "epoch": 1 }, { "type": "loss", "content": 0.13376504182815552, "timestamp": "2025-09-02 14:11:23.958980", "step": 162, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:24.012248", "step": 162, "epoch": 1 }, { "type": "loss", "content": 0.2710311710834503, "timestamp": "2025-09-02 14:11:24.014995", "step": 163, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:24.068632", "step": 163, "epoch": 1 }, { "type": "loss", "content": 0.242924764752388, "timestamp": "2025-09-02 14:11:24.074867", "step": 164, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:24.126293", "step": 164, "epoch": 1 }, { "type": "loss", "content": 0.14946642518043518, "timestamp": "2025-09-02 14:11:24.128599", "step": 165, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:24.181246", "step": 165, "epoch": 1 }, { "type": "loss", "content": 0.11319679766893387, "timestamp": "2025-09-02 14:11:24.183484", "step": 166, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:24.235745", "step": 166, "epoch": 1 }, { "type": "loss", "content": 0.15618744492530823, "timestamp": "2025-09-02 14:11:24.238024", "step": 167, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:24.290544", "step": 167, "epoch": 1 }, { "type": "loss", "content": 0.21619072556495667, "timestamp": "2025-09-02 14:11:24.296558", "step": 168, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:24.348318", "step": 168, "epoch": 1 }, { "type": "loss", "content": 0.17910590767860413, "timestamp": "2025-09-02 14:11:24.350717", "step": 169, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:24.404259", "step": 169, "epoch": 1 }, { "type": "loss", "content": 0.19981399178504944, "timestamp": "2025-09-02 14:11:24.406642", "step": 170, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:24.459943", "step": 170, "epoch": 1 }, { "type": "loss", "content": 0.1394142210483551, "timestamp": "2025-09-02 14:11:24.462202", "step": 171, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:24.515130", "step": 171, "epoch": 1 }, { "type": "loss", "content": 0.12695267796516418, "timestamp": "2025-09-02 14:11:24.520915", "step": 172, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:24.574288", "step": 172, "epoch": 1 }, { "type": "loss", "content": 0.13462139666080475, "timestamp": "2025-09-02 14:11:24.576828", "step": 173, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:24.630025", "step": 173, "epoch": 1 }, { "type": "loss", "content": 0.18055644631385803, "timestamp": "2025-09-02 14:11:24.632355", "step": 174, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:24.685168", "step": 174, "epoch": 1 }, { "type": "loss", "content": 0.2353421002626419, "timestamp": "2025-09-02 14:11:24.687595", "step": 175, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:24.741252", "step": 175, "epoch": 1 }, { "type": "loss", "content": 0.29011309146881104, "timestamp": "2025-09-02 14:11:24.747174", "step": 176, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:24.799445", "step": 176, "epoch": 1 }, { "type": "loss", "content": 0.1579749882221222, "timestamp": "2025-09-02 14:11:24.802028", "step": 177, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:24.854802", "step": 177, "epoch": 1 }, { "type": "loss", "content": 0.1445482075214386, "timestamp": "2025-09-02 14:11:24.857125", "step": 178, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:24.910803", "step": 178, "epoch": 1 }, { "type": "loss", "content": 0.15605200827121735, "timestamp": "2025-09-02 14:11:24.913141", "step": 179, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:24.966961", "step": 179, "epoch": 1 }, { "type": "loss", "content": 0.3074917197227478, "timestamp": "2025-09-02 14:11:24.972937", "step": 180, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:11:26.839360", "step": 180, "epoch": 1 }, { "type": "pplx", "content": 8404.169488930242, "timestamp": "2025-09-02 14:11:26.841955", "step": 180, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:26.894587", "step": 180, "epoch": 1 }, { "type": "loss", "content": 0.12450110167264938, "timestamp": "2025-09-02 14:11:26.897138", "step": 181, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:26.951383", "step": 181, "epoch": 1 }, { "type": "loss", "content": 0.21766464412212372, "timestamp": "2025-09-02 14:11:26.955236", "step": 182, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:27.008927", "step": 182, "epoch": 1 }, { "type": "loss", "content": 0.1520894467830658, "timestamp": "2025-09-02 14:11:27.011366", "step": 183, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:27.064686", "step": 183, "epoch": 1 }, { "type": "loss", "content": 0.23636944591999054, "timestamp": "2025-09-02 14:11:27.071333", "step": 184, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:27.124623", "step": 184, "epoch": 1 }, { "type": "loss", "content": 0.41936594247817993, "timestamp": "2025-09-02 14:11:27.126768", "step": 185, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:27.179957", "step": 185, "epoch": 1 }, { "type": "loss", "content": 0.23426678776741028, "timestamp": "2025-09-02 14:11:27.182535", "step": 186, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:27.237416", "step": 186, "epoch": 1 }, { "type": "loss", "content": 0.1626775562763214, "timestamp": "2025-09-02 14:11:27.239736", "step": 187, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:27.294410", "step": 187, "epoch": 1 }, { "type": "loss", "content": 0.25492268800735474, "timestamp": "2025-09-02 14:11:27.300743", "step": 188, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:27.353926", "step": 188, "epoch": 1 }, { "type": "loss", "content": 0.21203386783599854, "timestamp": "2025-09-02 14:11:27.356238", "step": 189, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:27.410820", "step": 189, "epoch": 1 }, { "type": "loss", "content": 0.15729370713233948, "timestamp": "2025-09-02 14:11:27.412964", "step": 190, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:27.465817", "step": 190, "epoch": 1 }, { "type": "loss", "content": 0.18621526658535004, "timestamp": "2025-09-02 14:11:27.468018", "step": 191, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:27.520483", "step": 191, "epoch": 1 }, { "type": "loss", "content": 0.09764448553323746, "timestamp": "2025-09-02 14:11:27.526658", "step": 192, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:27.578339", "step": 192, "epoch": 1 }, { "type": "loss", "content": 0.18812237679958344, "timestamp": "2025-09-02 14:11:27.580649", "step": 193, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:27.634082", "step": 193, "epoch": 1 }, { "type": "loss", "content": 0.26325321197509766, "timestamp": "2025-09-02 14:11:27.636732", "step": 194, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:27.689871", "step": 194, "epoch": 1 }, { "type": "loss", "content": 0.3092162311077118, "timestamp": "2025-09-02 14:11:27.693012", "step": 195, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:27.747086", "step": 195, "epoch": 1 }, { "type": "loss", "content": 0.1902117133140564, "timestamp": "2025-09-02 14:11:27.753135", "step": 196, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:27.806080", "step": 196, "epoch": 1 }, { "type": "loss", "content": 0.19268517196178436, "timestamp": "2025-09-02 14:11:27.808536", "step": 197, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:27.861806", "step": 197, "epoch": 1 }, { "type": "loss", "content": 0.18671287596225739, "timestamp": "2025-09-02 14:11:27.864081", "step": 198, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:27.918708", "step": 198, "epoch": 1 }, { "type": "loss", "content": 0.18813113868236542, "timestamp": "2025-09-02 14:11:27.921020", "step": 199, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:27.975151", "step": 199, "epoch": 1 }, { "type": "loss", "content": 0.16217949986457825, "timestamp": "2025-09-02 14:11:27.981326", "step": 200, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:11:29.859019", "step": 200, "epoch": 1 }, { "type": "pplx", "content": 8338.494751120272, "timestamp": "2025-09-02 14:11:29.860921", "step": 200, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 200", "timestamp": "2025-09-02 14:11:30.231135", "step": 200, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:30.289405", "step": 200, "epoch": 1 }, { "type": "loss", "content": 0.12559588253498077, "timestamp": "2025-09-02 14:11:30.291807", "step": 201, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:30.346295", "step": 201, "epoch": 1 }, { "type": "loss", "content": 0.12275493144989014, "timestamp": "2025-09-02 14:11:30.348744", "step": 202, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:30.401741", "step": 202, "epoch": 1 }, { "type": "loss", "content": 0.24683508276939392, "timestamp": "2025-09-02 14:11:30.404086", "step": 203, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:30.457261", "step": 203, "epoch": 1 }, { "type": "loss", "content": 0.13924182951450348, "timestamp": "2025-09-02 14:11:30.463704", "step": 204, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:30.516341", "step": 204, "epoch": 1 }, { "type": "loss", "content": 0.18038642406463623, "timestamp": "2025-09-02 14:11:30.518746", "step": 205, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:30.571889", "step": 205, "epoch": 1 }, { "type": "loss", "content": 0.23362092673778534, "timestamp": "2025-09-02 14:11:30.574427", "step": 206, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:30.627191", "step": 206, "epoch": 1 }, { "type": "loss", "content": 0.2100873440504074, "timestamp": "2025-09-02 14:11:30.629739", "step": 207, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:30.682421", "step": 207, "epoch": 1 }, { "type": "loss", "content": 0.290188193321228, "timestamp": "2025-09-02 14:11:30.688981", "step": 208, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:30.740991", "step": 208, "epoch": 1 }, { "type": "loss", "content": 0.10338792949914932, "timestamp": "2025-09-02 14:11:30.743373", "step": 209, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:30.796802", "step": 209, "epoch": 1 }, { "type": "loss", "content": 0.2083229124546051, "timestamp": "2025-09-02 14:11:30.799575", "step": 210, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:30.852666", "step": 210, "epoch": 1 }, { "type": "loss", "content": 0.22840125858783722, "timestamp": "2025-09-02 14:11:30.855553", "step": 211, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:30.908634", "step": 211, "epoch": 1 }, { "type": "loss", "content": 0.15054598450660706, "timestamp": "2025-09-02 14:11:30.915186", "step": 212, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:30.967995", "step": 212, "epoch": 1 }, { "type": "loss", "content": 0.22725050151348114, "timestamp": "2025-09-02 14:11:30.970465", "step": 213, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:31.024222", "step": 213, "epoch": 1 }, { "type": "loss", "content": 0.23637203872203827, "timestamp": "2025-09-02 14:11:31.026464", "step": 214, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:31.079259", "step": 214, "epoch": 1 }, { "type": "loss", "content": 0.22682934999465942, "timestamp": "2025-09-02 14:11:31.081204", "step": 215, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:31.133531", "step": 215, "epoch": 1 }, { "type": "loss", "content": 0.1871100664138794, "timestamp": "2025-09-02 14:11:31.139636", "step": 216, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:31.191370", "step": 216, "epoch": 1 }, { "type": "loss", "content": 0.4629529118537903, "timestamp": "2025-09-02 14:11:31.193575", "step": 217, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:31.248347", "step": 217, "epoch": 1 }, { "type": "loss", "content": 0.1622951626777649, "timestamp": "2025-09-02 14:11:31.250606", "step": 218, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:31.303737", "step": 218, "epoch": 1 }, { "type": "loss", "content": 0.22988523542881012, "timestamp": "2025-09-02 14:11:31.305970", "step": 219, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:31.359238", "step": 219, "epoch": 1 }, { "type": "loss", "content": 0.16411375999450684, "timestamp": "2025-09-02 14:11:31.365430", "step": 220, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:11:33.259371", "step": 220, "epoch": 1 }, { "type": "pplx", "content": 8206.606547469268, "timestamp": "2025-09-02 14:11:33.261689", "step": 220, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:33.313793", "step": 220, "epoch": 1 }, { "type": "loss", "content": 0.13657645881175995, "timestamp": "2025-09-02 14:11:33.316262", "step": 221, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:33.370939", "step": 221, "epoch": 1 }, { "type": "loss", "content": 0.2627924978733063, "timestamp": "2025-09-02 14:11:33.373098", "step": 222, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:33.427494", "step": 222, "epoch": 1 }, { "type": "loss", "content": 0.3759519159793854, "timestamp": "2025-09-02 14:11:33.429872", "step": 223, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:33.483607", "step": 223, "epoch": 1 }, { "type": "loss", "content": 0.2900035083293915, "timestamp": "2025-09-02 14:11:33.489877", "step": 224, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:33.542519", "step": 224, "epoch": 1 }, { "type": "loss", "content": 0.2744384706020355, "timestamp": "2025-09-02 14:11:33.544922", "step": 225, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:33.598332", "step": 225, "epoch": 1 }, { "type": "loss", "content": 0.20535513758659363, "timestamp": "2025-09-02 14:11:33.600598", "step": 226, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:33.653622", "step": 226, "epoch": 1 }, { "type": "loss", "content": 0.24257905781269073, "timestamp": "2025-09-02 14:11:33.655845", "step": 227, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:33.708641", "step": 227, "epoch": 1 }, { "type": "loss", "content": 0.19351902604103088, "timestamp": "2025-09-02 14:11:33.714932", "step": 228, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:33.768652", "step": 228, "epoch": 1 }, { "type": "loss", "content": 0.19246715307235718, "timestamp": "2025-09-02 14:11:33.770807", "step": 229, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:33.823922", "step": 229, "epoch": 1 }, { "type": "loss", "content": 0.26765140891075134, "timestamp": "2025-09-02 14:11:33.826188", "step": 230, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:33.879829", "step": 230, "epoch": 1 }, { "type": "loss", "content": 0.2606057822704315, "timestamp": "2025-09-02 14:11:33.881982", "step": 231, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:33.935330", "step": 231, "epoch": 1 }, { "type": "loss", "content": 0.1376591920852661, "timestamp": "2025-09-02 14:11:33.941522", "step": 232, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:33.995499", "step": 232, "epoch": 1 }, { "type": "loss", "content": 0.2524031102657318, "timestamp": "2025-09-02 14:11:33.997838", "step": 233, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:34.052161", "step": 233, "epoch": 1 }, { "type": "loss", "content": 0.30885934829711914, "timestamp": "2025-09-02 14:11:34.054898", "step": 234, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:34.109399", "step": 234, "epoch": 1 }, { "type": "loss", "content": 0.35216644406318665, "timestamp": "2025-09-02 14:11:34.111605", "step": 235, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:34.164411", "step": 235, "epoch": 1 }, { "type": "loss", "content": 0.2595987319946289, "timestamp": "2025-09-02 14:11:34.170437", "step": 236, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:34.222522", "step": 236, "epoch": 1 }, { "type": "loss", "content": 0.1879676878452301, "timestamp": "2025-09-02 14:11:34.224695", "step": 237, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:34.278399", "step": 237, "epoch": 1 }, { "type": "loss", "content": 0.1831701695919037, "timestamp": "2025-09-02 14:11:34.280602", "step": 238, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:34.333685", "step": 238, "epoch": 1 }, { "type": "loss", "content": 0.24739985167980194, "timestamp": "2025-09-02 14:11:34.335880", "step": 239, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:34.389194", "step": 239, "epoch": 1 }, { "type": "loss", "content": 0.18602381646633148, "timestamp": "2025-09-02 14:11:34.395155", "step": 240, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:11:36.247753", "step": 240, "epoch": 1 }, { "type": "pplx", "content": 8107.147895707243, "timestamp": "2025-09-02 14:11:36.249373", "step": 240, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 240", "timestamp": "2025-09-02 14:11:36.638539", "step": 240, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:36.695614", "step": 240, "epoch": 1 }, { "type": "loss", "content": 0.32137730717658997, "timestamp": "2025-09-02 14:11:36.697792", "step": 241, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:36.751724", "step": 241, "epoch": 1 }, { "type": "loss", "content": 0.24773040413856506, "timestamp": "2025-09-02 14:11:36.754134", "step": 242, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:36.807426", "step": 242, "epoch": 1 }, { "type": "loss", "content": 0.18468935787677765, "timestamp": "2025-09-02 14:11:36.809632", "step": 243, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:36.864109", "step": 243, "epoch": 1 }, { "type": "loss", "content": 0.24690841138362885, "timestamp": "2025-09-02 14:11:36.870493", "step": 244, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:36.923560", "step": 244, "epoch": 1 }, { "type": "loss", "content": 0.26695218682289124, "timestamp": "2025-09-02 14:11:36.925907", "step": 245, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:36.978788", "step": 245, "epoch": 1 }, { "type": "loss", "content": 0.26805809140205383, "timestamp": "2025-09-02 14:11:36.982190", "step": 246, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:37.035120", "step": 246, "epoch": 1 }, { "type": "loss", "content": 0.2250659316778183, "timestamp": "2025-09-02 14:11:37.037226", "step": 247, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:37.090114", "step": 247, "epoch": 1 }, { "type": "loss", "content": 0.19859763979911804, "timestamp": "2025-09-02 14:11:37.096506", "step": 248, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:37.148264", "step": 248, "epoch": 1 }, { "type": "loss", "content": 0.12188725173473358, "timestamp": "2025-09-02 14:11:37.150839", "step": 249, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:37.203608", "step": 249, "epoch": 1 }, { "type": "loss", "content": 0.2817313075065613, "timestamp": "2025-09-02 14:11:37.205590", "step": 250, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:37.258261", "step": 250, "epoch": 1 }, { "type": "loss", "content": 0.2612548768520355, "timestamp": "2025-09-02 14:11:37.260848", "step": 251, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:37.314303", "step": 251, "epoch": 1 }, { "type": "loss", "content": 0.18702000379562378, "timestamp": "2025-09-02 14:11:37.320326", "step": 252, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:37.372274", "step": 252, "epoch": 1 }, { "type": "loss", "content": 0.40859705209732056, "timestamp": "2025-09-02 14:11:37.374421", "step": 253, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:37.428196", "step": 253, "epoch": 1 }, { "type": "loss", "content": 0.1506912112236023, "timestamp": "2025-09-02 14:11:37.430295", "step": 254, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:37.482926", "step": 254, "epoch": 1 }, { "type": "loss", "content": 0.10864438861608505, "timestamp": "2025-09-02 14:11:37.485188", "step": 255, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:37.537826", "step": 255, "epoch": 1 }, { "type": "loss", "content": 0.28241872787475586, "timestamp": "2025-09-02 14:11:37.543710", "step": 256, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:37.595795", "step": 256, "epoch": 1 }, { "type": "loss", "content": 0.15065060555934906, "timestamp": "2025-09-02 14:11:37.598437", "step": 257, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:37.651348", "step": 257, "epoch": 1 }, { "type": "loss", "content": 0.21485954523086548, "timestamp": "2025-09-02 14:11:37.653693", "step": 258, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:37.708306", "step": 258, "epoch": 1 }, { "type": "loss", "content": 0.18508775532245636, "timestamp": "2025-09-02 14:11:37.710734", "step": 259, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:37.762884", "step": 259, "epoch": 1 }, { "type": "loss", "content": 0.17730647325515747, "timestamp": "2025-09-02 14:11:37.768579", "step": 260, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:11:39.636123", "step": 260, "epoch": 1 }, { "type": "pplx", "content": 7951.349938729474, "timestamp": "2025-09-02 14:11:39.638780", "step": 260, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:39.692355", "step": 260, "epoch": 1 }, { "type": "loss", "content": 0.19420960545539856, "timestamp": "2025-09-02 14:11:39.694606", "step": 261, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:39.747967", "step": 261, "epoch": 1 }, { "type": "loss", "content": 0.18640749156475067, "timestamp": "2025-09-02 14:11:39.750446", "step": 262, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:39.804490", "step": 262, "epoch": 1 }, { "type": "loss", "content": 0.3022635281085968, "timestamp": "2025-09-02 14:11:39.806536", "step": 263, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:39.859481", "step": 263, "epoch": 1 }, { "type": "loss", "content": 0.26302093267440796, "timestamp": "2025-09-02 14:11:39.865812", "step": 264, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:39.917618", "step": 264, "epoch": 1 }, { "type": "loss", "content": 0.117647685110569, "timestamp": "2025-09-02 14:11:39.919765", "step": 265, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:39.973534", "step": 265, "epoch": 1 }, { "type": "loss", "content": 0.26924780011177063, "timestamp": "2025-09-02 14:11:39.975947", "step": 266, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:40.028530", "step": 266, "epoch": 1 }, { "type": "loss", "content": 0.3277054727077484, "timestamp": "2025-09-02 14:11:40.030613", "step": 267, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:40.084329", "step": 267, "epoch": 1 }, { "type": "loss", "content": 0.1641945242881775, "timestamp": "2025-09-02 14:11:40.090273", "step": 268, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:40.142503", "step": 268, "epoch": 1 }, { "type": "loss", "content": 0.18619287014007568, "timestamp": "2025-09-02 14:11:40.144811", "step": 269, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:40.198269", "step": 269, "epoch": 1 }, { "type": "loss", "content": 0.23994676768779755, "timestamp": "2025-09-02 14:11:40.203747", "step": 270, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:40.261524", "step": 270, "epoch": 1 }, { "type": "loss", "content": 0.16847094893455505, "timestamp": "2025-09-02 14:11:40.263870", "step": 271, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:40.317201", "step": 271, "epoch": 1 }, { "type": "loss", "content": 0.27865520119667053, "timestamp": "2025-09-02 14:11:40.323268", "step": 272, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:40.375477", "step": 272, "epoch": 1 }, { "type": "loss", "content": 0.15914484858512878, "timestamp": "2025-09-02 14:11:40.377542", "step": 273, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:40.430491", "step": 273, "epoch": 1 }, { "type": "loss", "content": 0.22165906429290771, "timestamp": "2025-09-02 14:11:40.432951", "step": 274, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:40.485397", "step": 274, "epoch": 1 }, { "type": "loss", "content": 0.16933892667293549, "timestamp": "2025-09-02 14:11:40.487970", "step": 275, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:40.540189", "step": 275, "epoch": 1 }, { "type": "loss", "content": 0.22679181396961212, "timestamp": "2025-09-02 14:11:40.546409", "step": 276, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:40.598260", "step": 276, "epoch": 1 }, { "type": "loss", "content": 0.18737545609474182, "timestamp": "2025-09-02 14:11:40.600412", "step": 277, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:40.653833", "step": 277, "epoch": 1 }, { "type": "loss", "content": 0.16737669706344604, "timestamp": "2025-09-02 14:11:40.655908", "step": 278, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:40.708122", "step": 278, "epoch": 1 }, { "type": "loss", "content": 0.3010156452655792, "timestamp": "2025-09-02 14:11:40.710369", "step": 279, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:40.763724", "step": 279, "epoch": 1 }, { "type": "loss", "content": 0.32296833395957947, "timestamp": "2025-09-02 14:11:40.769814", "step": 280, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:11:42.638208", "step": 280, "epoch": 1 }, { "type": "pplx", "content": 7888.637050981771, "timestamp": "2025-09-02 14:11:42.640371", "step": 280, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 280", "timestamp": "2025-09-02 14:11:43.017513", "step": 280, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:43.074680", "step": 280, "epoch": 1 }, { "type": "loss", "content": 0.19102539122104645, "timestamp": "2025-09-02 14:11:43.077573", "step": 281, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:43.132314", "step": 281, "epoch": 1 }, { "type": "loss", "content": 0.3259015381336212, "timestamp": "2025-09-02 14:11:43.134435", "step": 282, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:43.187613", "step": 282, "epoch": 1 }, { "type": "loss", "content": 0.2651236057281494, "timestamp": "2025-09-02 14:11:43.189827", "step": 283, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:43.243233", "step": 283, "epoch": 1 }, { "type": "loss", "content": 0.12241033464670181, "timestamp": "2025-09-02 14:11:43.249570", "step": 284, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:43.303110", "step": 284, "epoch": 1 }, { "type": "loss", "content": 0.17490163445472717, "timestamp": "2025-09-02 14:11:43.305491", "step": 285, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:43.358174", "step": 285, "epoch": 1 }, { "type": "loss", "content": 0.32618117332458496, "timestamp": "2025-09-02 14:11:43.360731", "step": 286, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:43.413856", "step": 286, "epoch": 1 }, { "type": "loss", "content": 0.1376430243253708, "timestamp": "2025-09-02 14:11:43.416485", "step": 287, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:43.469808", "step": 287, "epoch": 1 }, { "type": "loss", "content": 0.12636792659759521, "timestamp": "2025-09-02 14:11:43.475703", "step": 288, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:43.527875", "step": 288, "epoch": 1 }, { "type": "loss", "content": 0.22067885100841522, "timestamp": "2025-09-02 14:11:43.530350", "step": 289, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:43.583294", "step": 289, "epoch": 1 }, { "type": "loss", "content": 0.15151070058345795, "timestamp": "2025-09-02 14:11:43.586026", "step": 290, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:43.639312", "step": 290, "epoch": 1 }, { "type": "loss", "content": 0.2128380537033081, "timestamp": "2025-09-02 14:11:43.641563", "step": 291, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:43.694844", "step": 291, "epoch": 1 }, { "type": "loss", "content": 0.09154859185218811, "timestamp": "2025-09-02 14:11:43.700960", "step": 292, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:43.753423", "step": 292, "epoch": 1 }, { "type": "loss", "content": 0.36208575963974, "timestamp": "2025-09-02 14:11:43.755911", "step": 293, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:43.809264", "step": 293, "epoch": 1 }, { "type": "loss", "content": 0.28077632188796997, "timestamp": "2025-09-02 14:11:43.811796", "step": 294, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:43.865122", "step": 294, "epoch": 1 }, { "type": "loss", "content": 0.2669871151447296, "timestamp": "2025-09-02 14:11:43.867413", "step": 295, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:43.921224", "step": 295, "epoch": 1 }, { "type": "loss", "content": 0.1643238514661789, "timestamp": "2025-09-02 14:11:43.926966", "step": 296, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:43.980830", "step": 296, "epoch": 1 }, { "type": "loss", "content": 0.18140532076358795, "timestamp": "2025-09-02 14:11:43.986372", "step": 297, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:44.041105", "step": 297, "epoch": 1 }, { "type": "loss", "content": 0.13920722901821136, "timestamp": "2025-09-02 14:11:44.044455", "step": 298, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:44.098357", "step": 298, "epoch": 1 }, { "type": "loss", "content": 0.23993617296218872, "timestamp": "2025-09-02 14:11:44.100946", "step": 299, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:44.154385", "step": 299, "epoch": 1 }, { "type": "loss", "content": 0.14834249019622803, "timestamp": "2025-09-02 14:11:44.160682", "step": 300, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:11:46.059071", "step": 300, "epoch": 1 }, { "type": "pplx", "content": 7860.3564301368815, "timestamp": "2025-09-02 14:11:46.061307", "step": 300, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:46.113832", "step": 300, "epoch": 1 }, { "type": "loss", "content": 0.2623777687549591, "timestamp": "2025-09-02 14:11:46.116298", "step": 301, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:46.170383", "step": 301, "epoch": 1 }, { "type": "loss", "content": 0.15459969639778137, "timestamp": "2025-09-02 14:11:46.172746", "step": 302, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:46.226222", "step": 302, "epoch": 1 }, { "type": "loss", "content": 0.12650948762893677, "timestamp": "2025-09-02 14:11:46.228198", "step": 303, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:46.281034", "step": 303, "epoch": 1 }, { "type": "loss", "content": 0.16587090492248535, "timestamp": "2025-09-02 14:11:46.287470", "step": 304, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:46.342273", "step": 304, "epoch": 1 }, { "type": "loss", "content": 0.2097996026277542, "timestamp": "2025-09-02 14:11:46.344468", "step": 305, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:46.397396", "step": 305, "epoch": 1 }, { "type": "loss", "content": 0.3103104829788208, "timestamp": "2025-09-02 14:11:46.399593", "step": 306, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:46.453674", "step": 306, "epoch": 1 }, { "type": "loss", "content": 0.23800285160541534, "timestamp": "2025-09-02 14:11:46.459389", "step": 307, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:46.515691", "step": 307, "epoch": 1 }, { "type": "loss", "content": 0.2639383375644684, "timestamp": "2025-09-02 14:11:46.521392", "step": 308, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:46.573060", "step": 308, "epoch": 1 }, { "type": "loss", "content": 0.26161691546440125, "timestamp": "2025-09-02 14:11:46.574940", "step": 309, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:46.627734", "step": 309, "epoch": 1 }, { "type": "loss", "content": 0.27204281091690063, "timestamp": "2025-09-02 14:11:46.629847", "step": 310, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:46.682280", "step": 310, "epoch": 1 }, { "type": "loss", "content": 0.13487358391284943, "timestamp": "2025-09-02 14:11:46.684550", "step": 311, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:46.736488", "step": 311, "epoch": 1 }, { "type": "loss", "content": 0.2292563021183014, "timestamp": "2025-09-02 14:11:46.742306", "step": 312, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:46.794593", "step": 312, "epoch": 1 }, { "type": "loss", "content": 0.15218134224414825, "timestamp": "2025-09-02 14:11:46.796897", "step": 313, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:46.850226", "step": 313, "epoch": 1 }, { "type": "loss", "content": 0.22023893892765045, "timestamp": "2025-09-02 14:11:46.852619", "step": 314, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:46.905330", "step": 314, "epoch": 1 }, { "type": "loss", "content": 0.250385582447052, "timestamp": "2025-09-02 14:11:46.907667", "step": 315, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:46.961329", "step": 315, "epoch": 1 }, { "type": "loss", "content": 0.17987047135829926, "timestamp": "2025-09-02 14:11:46.967596", "step": 316, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:47.020469", "step": 316, "epoch": 1 }, { "type": "loss", "content": 0.10755063593387604, "timestamp": "2025-09-02 14:11:47.022720", "step": 317, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:47.075552", "step": 317, "epoch": 1 }, { "type": "loss", "content": 0.11269817501306534, "timestamp": "2025-09-02 14:11:47.077615", "step": 318, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:47.130270", "step": 318, "epoch": 1 }, { "type": "loss", "content": 0.14703917503356934, "timestamp": "2025-09-02 14:11:47.132486", "step": 319, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:47.185203", "step": 319, "epoch": 1 }, { "type": "loss", "content": 0.2446051388978958, "timestamp": "2025-09-02 14:11:47.191585", "step": 320, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:11:49.034165", "step": 320, "epoch": 1 }, { "type": "pplx", "content": 7879.960617175855, "timestamp": "2025-09-02 14:11:49.038211", "step": 320, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 320", "timestamp": "2025-09-02 14:11:49.398964", "step": 320, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:49.454517", "step": 320, "epoch": 1 }, { "type": "loss", "content": 0.30709829926490784, "timestamp": "2025-09-02 14:11:49.456953", "step": 321, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:49.510200", "step": 321, "epoch": 1 }, { "type": "loss", "content": 0.2088862657546997, "timestamp": "2025-09-02 14:11:49.512419", "step": 322, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:49.565770", "step": 322, "epoch": 1 }, { "type": "loss", "content": 0.22018271684646606, "timestamp": "2025-09-02 14:11:49.568448", "step": 323, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:49.628606", "step": 323, "epoch": 1 }, { "type": "loss", "content": 0.11273863166570663, "timestamp": "2025-09-02 14:11:49.634592", "step": 324, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:49.686061", "step": 324, "epoch": 1 }, { "type": "loss", "content": 0.24738889932632446, "timestamp": "2025-09-02 14:11:49.688195", "step": 325, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:49.741154", "step": 325, "epoch": 1 }, { "type": "loss", "content": 0.25922316312789917, "timestamp": "2025-09-02 14:11:49.742956", "step": 326, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:49.795781", "step": 326, "epoch": 1 }, { "type": "loss", "content": 0.16098614037036896, "timestamp": "2025-09-02 14:11:49.798473", "step": 327, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:49.851622", "step": 327, "epoch": 1 }, { "type": "loss", "content": 0.3729935884475708, "timestamp": "2025-09-02 14:11:49.857643", "step": 328, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:49.910521", "step": 328, "epoch": 1 }, { "type": "loss", "content": 0.1728862226009369, "timestamp": "2025-09-02 14:11:49.913040", "step": 329, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:49.965973", "step": 329, "epoch": 1 }, { "type": "loss", "content": 0.14608335494995117, "timestamp": "2025-09-02 14:11:49.968234", "step": 330, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:50.020857", "step": 330, "epoch": 1 }, { "type": "loss", "content": 0.29661113023757935, "timestamp": "2025-09-02 14:11:50.023105", "step": 331, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:50.075784", "step": 331, "epoch": 1 }, { "type": "loss", "content": 0.19156232476234436, "timestamp": "2025-09-02 14:11:50.081334", "step": 332, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:50.133463", "step": 332, "epoch": 1 }, { "type": "loss", "content": 0.3583519160747528, "timestamp": "2025-09-02 14:11:50.135301", "step": 333, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:50.188187", "step": 333, "epoch": 1 }, { "type": "loss", "content": 0.1855548918247223, "timestamp": "2025-09-02 14:11:50.190235", "step": 334, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:50.243098", "step": 334, "epoch": 1 }, { "type": "loss", "content": 0.329773485660553, "timestamp": "2025-09-02 14:11:50.245345", "step": 335, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:50.298321", "step": 335, "epoch": 1 }, { "type": "loss", "content": 0.30350595712661743, "timestamp": "2025-09-02 14:11:50.304168", "step": 336, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:50.355889", "step": 336, "epoch": 1 }, { "type": "loss", "content": 0.05443538725376129, "timestamp": "2025-09-02 14:11:50.358375", "step": 337, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:50.410972", "step": 337, "epoch": 1 }, { "type": "loss", "content": 0.27829498052597046, "timestamp": "2025-09-02 14:11:50.413339", "step": 338, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:50.468440", "step": 338, "epoch": 1 }, { "type": "loss", "content": 0.1455927938222885, "timestamp": "2025-09-02 14:11:50.470709", "step": 339, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:50.523833", "step": 339, "epoch": 1 }, { "type": "loss", "content": 0.10769162327051163, "timestamp": "2025-09-02 14:11:50.529470", "step": 340, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:11:52.397357", "step": 340, "epoch": 1 }, { "type": "pplx", "content": 7865.934276791511, "timestamp": "2025-09-02 14:11:52.399469", "step": 340, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:52.452059", "step": 340, "epoch": 1 }, { "type": "loss", "content": 0.16536718606948853, "timestamp": "2025-09-02 14:11:52.454227", "step": 341, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:52.507178", "step": 341, "epoch": 1 }, { "type": "loss", "content": 0.23829719424247742, "timestamp": "2025-09-02 14:11:52.509559", "step": 342, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:52.563242", "step": 342, "epoch": 1 }, { "type": "loss", "content": 0.17271599173545837, "timestamp": "2025-09-02 14:11:52.565978", "step": 343, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:52.619728", "step": 343, "epoch": 1 }, { "type": "loss", "content": 0.22334463894367218, "timestamp": "2025-09-02 14:11:52.625783", "step": 344, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:52.679560", "step": 344, "epoch": 1 }, { "type": "loss", "content": 0.22640252113342285, "timestamp": "2025-09-02 14:11:52.682329", "step": 345, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:52.736235", "step": 345, "epoch": 1 }, { "type": "loss", "content": 0.14908330142498016, "timestamp": "2025-09-02 14:11:52.738886", "step": 346, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:52.791409", "step": 346, "epoch": 1 }, { "type": "loss", "content": 0.1864229142665863, "timestamp": "2025-09-02 14:11:52.793641", "step": 347, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:52.845948", "step": 347, "epoch": 1 }, { "type": "loss", "content": 0.2839997410774231, "timestamp": "2025-09-02 14:11:52.851423", "step": 348, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:52.902628", "step": 348, "epoch": 1 }, { "type": "loss", "content": 0.17696379125118256, "timestamp": "2025-09-02 14:11:52.904894", "step": 349, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:52.958162", "step": 349, "epoch": 1 }, { "type": "loss", "content": 0.3115480840206146, "timestamp": "2025-09-02 14:11:52.960477", "step": 350, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:53.013264", "step": 350, "epoch": 1 }, { "type": "loss", "content": 0.12853959202766418, "timestamp": "2025-09-02 14:11:53.017274", "step": 351, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:53.070109", "step": 351, "epoch": 1 }, { "type": "loss", "content": 0.2619737684726715, "timestamp": "2025-09-02 14:11:53.075982", "step": 352, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:53.128732", "step": 352, "epoch": 1 }, { "type": "loss", "content": 0.21577659249305725, "timestamp": "2025-09-02 14:11:53.131145", "step": 353, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:53.183415", "step": 353, "epoch": 1 }, { "type": "loss", "content": 0.1827879101037979, "timestamp": "2025-09-02 14:11:53.185959", "step": 354, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:53.239224", "step": 354, "epoch": 1 }, { "type": "loss", "content": 0.09786248207092285, "timestamp": "2025-09-02 14:11:53.241372", "step": 355, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:53.293844", "step": 355, "epoch": 1 }, { "type": "loss", "content": 0.32092446088790894, "timestamp": "2025-09-02 14:11:53.299626", "step": 356, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:53.351955", "step": 356, "epoch": 1 }, { "type": "loss", "content": 0.17514856159687042, "timestamp": "2025-09-02 14:11:53.353904", "step": 357, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:53.406129", "step": 357, "epoch": 1 }, { "type": "loss", "content": 0.2227691411972046, "timestamp": "2025-09-02 14:11:53.408375", "step": 358, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:53.460887", "step": 358, "epoch": 1 }, { "type": "loss", "content": 0.19636264443397522, "timestamp": "2025-09-02 14:11:53.463001", "step": 359, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:53.515133", "step": 359, "epoch": 1 }, { "type": "loss", "content": 0.1950041502714157, "timestamp": "2025-09-02 14:11:53.521163", "step": 360, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:11:55.381170", "step": 360, "epoch": 1 }, { "type": "pplx", "content": 7934.659814462033, "timestamp": "2025-09-02 14:11:55.383381", "step": 360, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 360", "timestamp": "2025-09-02 14:11:55.751792", "step": 360, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:55.807235", "step": 360, "epoch": 1 }, { "type": "loss", "content": 0.1930122971534729, "timestamp": "2025-09-02 14:11:55.809544", "step": 361, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:55.864376", "step": 361, "epoch": 1 }, { "type": "loss", "content": 0.3382236659526825, "timestamp": "2025-09-02 14:11:55.866612", "step": 362, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:55.919507", "step": 362, "epoch": 1 }, { "type": "loss", "content": 0.46654754877090454, "timestamp": "2025-09-02 14:11:55.921699", "step": 363, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:55.974836", "step": 363, "epoch": 1 }, { "type": "loss", "content": 0.1052594780921936, "timestamp": "2025-09-02 14:11:55.981095", "step": 364, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:56.033158", "step": 364, "epoch": 1 }, { "type": "loss", "content": 0.3825656771659851, "timestamp": "2025-09-02 14:11:56.035482", "step": 365, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:56.088489", "step": 365, "epoch": 1 }, { "type": "loss", "content": 0.10555469244718552, "timestamp": "2025-09-02 14:11:56.090698", "step": 366, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:56.143174", "step": 366, "epoch": 1 }, { "type": "loss", "content": 0.23223817348480225, "timestamp": "2025-09-02 14:11:56.145537", "step": 367, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:56.198127", "step": 367, "epoch": 1 }, { "type": "loss", "content": 0.18667000532150269, "timestamp": "2025-09-02 14:11:56.204326", "step": 368, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:56.256422", "step": 368, "epoch": 1 }, { "type": "loss", "content": 0.39615896344184875, "timestamp": "2025-09-02 14:11:56.258596", "step": 369, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:56.312189", "step": 369, "epoch": 1 }, { "type": "loss", "content": 0.09070011973381042, "timestamp": "2025-09-02 14:11:56.314577", "step": 370, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:56.368546", "step": 370, "epoch": 1 }, { "type": "loss", "content": 0.4128700792789459, "timestamp": "2025-09-02 14:11:56.370949", "step": 371, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:56.424023", "step": 371, "epoch": 1 }, { "type": "loss", "content": 0.18803958594799042, "timestamp": "2025-09-02 14:11:56.430257", "step": 372, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:56.483407", "step": 372, "epoch": 1 }, { "type": "loss", "content": 0.12104673683643341, "timestamp": "2025-09-02 14:11:56.486089", "step": 373, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:56.538522", "step": 373, "epoch": 1 }, { "type": "loss", "content": 0.31144165992736816, "timestamp": "2025-09-02 14:11:56.540621", "step": 374, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:56.593430", "step": 374, "epoch": 1 }, { "type": "loss", "content": 0.23134464025497437, "timestamp": "2025-09-02 14:11:56.595871", "step": 375, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:56.648446", "step": 375, "epoch": 1 }, { "type": "loss", "content": 0.1689949631690979, "timestamp": "2025-09-02 14:11:56.653903", "step": 376, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:56.705822", "step": 376, "epoch": 1 }, { "type": "loss", "content": 0.13246819376945496, "timestamp": "2025-09-02 14:11:56.708495", "step": 377, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:56.762063", "step": 377, "epoch": 1 }, { "type": "loss", "content": 0.2212774008512497, "timestamp": "2025-09-02 14:11:56.764266", "step": 378, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:56.826375", "step": 378, "epoch": 1 }, { "type": "loss", "content": 0.09750914573669434, "timestamp": "2025-09-02 14:11:56.829276", "step": 379, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:56.882881", "step": 379, "epoch": 1 }, { "type": "loss", "content": 0.2706238031387329, "timestamp": "2025-09-02 14:11:56.888772", "step": 380, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:11:58.779190", "step": 380, "epoch": 1 }, { "type": "pplx", "content": 7920.750578928217, "timestamp": "2025-09-02 14:11:58.780851", "step": 380, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:58.832818", "step": 380, "epoch": 1 }, { "type": "loss", "content": 0.15898418426513672, "timestamp": "2025-09-02 14:11:58.834945", "step": 381, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:58.888522", "step": 381, "epoch": 1 }, { "type": "loss", "content": 0.2235444188117981, "timestamp": "2025-09-02 14:11:58.890484", "step": 382, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:58.943317", "step": 382, "epoch": 1 }, { "type": "loss", "content": 0.24983066320419312, "timestamp": "2025-09-02 14:11:58.945128", "step": 383, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:58.997711", "step": 383, "epoch": 1 }, { "type": "loss", "content": 0.1879381686449051, "timestamp": "2025-09-02 14:11:59.003922", "step": 384, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:59.057129", "step": 384, "epoch": 1 }, { "type": "loss", "content": 0.19686496257781982, "timestamp": "2025-09-02 14:11:59.059354", "step": 385, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:59.112303", "step": 385, "epoch": 1 }, { "type": "loss", "content": 0.37209951877593994, "timestamp": "2025-09-02 14:11:59.114817", "step": 386, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:59.168535", "step": 386, "epoch": 1 }, { "type": "loss", "content": 0.19846293330192566, "timestamp": "2025-09-02 14:11:59.170901", "step": 387, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:59.223486", "step": 387, "epoch": 1 }, { "type": "loss", "content": 0.24126803874969482, "timestamp": "2025-09-02 14:11:59.229855", "step": 388, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:59.282281", "step": 388, "epoch": 1 }, { "type": "loss", "content": 0.30669310688972473, "timestamp": "2025-09-02 14:11:59.284602", "step": 389, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:59.337519", "step": 389, "epoch": 1 }, { "type": "loss", "content": 0.10954746603965759, "timestamp": "2025-09-02 14:11:59.339799", "step": 390, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:11:59.392940", "step": 390, "epoch": 1 }, { "type": "loss", "content": 0.094220831990242, "timestamp": "2025-09-02 14:11:59.395193", "step": 391, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:59.447855", "step": 391, "epoch": 1 }, { "type": "loss", "content": 0.18358463048934937, "timestamp": "2025-09-02 14:11:59.453823", "step": 392, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:59.506904", "step": 392, "epoch": 1 }, { "type": "loss", "content": 0.1629645675420761, "timestamp": "2025-09-02 14:11:59.509068", "step": 393, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:59.561369", "step": 393, "epoch": 1 }, { "type": "loss", "content": 0.14648005366325378, "timestamp": "2025-09-02 14:11:59.563817", "step": 394, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:59.616027", "step": 394, "epoch": 1 }, { "type": "loss", "content": 0.2405136078596115, "timestamp": "2025-09-02 14:11:59.618279", "step": 395, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:59.671177", "step": 395, "epoch": 1 }, { "type": "loss", "content": 0.2650720775127411, "timestamp": "2025-09-02 14:11:59.677997", "step": 396, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:11:59.730429", "step": 396, "epoch": 1 }, { "type": "loss", "content": 0.3725948929786682, "timestamp": "2025-09-02 14:11:59.732835", "step": 397, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:59.786053", "step": 397, "epoch": 1 }, { "type": "loss", "content": 0.3094664216041565, "timestamp": "2025-09-02 14:11:59.788575", "step": 398, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:59.841265", "step": 398, "epoch": 1 }, { "type": "loss", "content": 0.15248140692710876, "timestamp": "2025-09-02 14:11:59.843428", "step": 399, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:11:59.896139", "step": 399, "epoch": 1 }, { "type": "loss", "content": 0.24742688238620758, "timestamp": "2025-09-02 14:11:59.902143", "step": 400, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:12:01.820221", "step": 400, "epoch": 1 }, { "type": "pplx", "content": 7868.416223922567, "timestamp": "2025-09-02 14:12:01.822419", "step": 400, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 400", "timestamp": "2025-09-02 14:12:02.200158", "step": 400, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:02.258256", "step": 400, "epoch": 1 }, { "type": "loss", "content": 0.20945453643798828, "timestamp": "2025-09-02 14:12:02.260342", "step": 401, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:02.314331", "step": 401, "epoch": 1 }, { "type": "loss", "content": 0.26058873534202576, "timestamp": "2025-09-02 14:12:02.317128", "step": 402, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:02.369739", "step": 402, "epoch": 1 }, { "type": "loss", "content": 0.13173873722553253, "timestamp": "2025-09-02 14:12:02.372101", "step": 403, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:02.424855", "step": 403, "epoch": 1 }, { "type": "loss", "content": 0.2744247615337372, "timestamp": "2025-09-02 14:12:02.431048", "step": 404, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:02.483263", "step": 404, "epoch": 1 }, { "type": "loss", "content": 0.27065834403038025, "timestamp": "2025-09-02 14:12:02.485264", "step": 405, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:02.537476", "step": 405, "epoch": 1 }, { "type": "loss", "content": 0.19585488736629486, "timestamp": "2025-09-02 14:12:02.539685", "step": 406, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:02.593094", "step": 406, "epoch": 1 }, { "type": "loss", "content": 0.15754683315753937, "timestamp": "2025-09-02 14:12:02.595392", "step": 407, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:02.648389", "step": 407, "epoch": 1 }, { "type": "loss", "content": 0.11081890016794205, "timestamp": "2025-09-02 14:12:02.654199", "step": 408, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:02.705634", "step": 408, "epoch": 1 }, { "type": "loss", "content": 0.18417030572891235, "timestamp": "2025-09-02 14:12:02.707817", "step": 409, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:02.760610", "step": 409, "epoch": 1 }, { "type": "loss", "content": 0.31176531314849854, "timestamp": "2025-09-02 14:12:02.762525", "step": 410, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:02.815384", "step": 410, "epoch": 1 }, { "type": "loss", "content": 0.19728145003318787, "timestamp": "2025-09-02 14:12:02.817698", "step": 411, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:02.870781", "step": 411, "epoch": 1 }, { "type": "loss", "content": 0.13057342171669006, "timestamp": "2025-09-02 14:12:02.876483", "step": 412, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:02.929301", "step": 412, "epoch": 1 }, { "type": "loss", "content": 0.23301728069782257, "timestamp": "2025-09-02 14:12:02.931393", "step": 413, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:02.984548", "step": 413, "epoch": 1 }, { "type": "loss", "content": 0.12916897237300873, "timestamp": "2025-09-02 14:12:02.986754", "step": 414, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:03.039763", "step": 414, "epoch": 1 }, { "type": "loss", "content": 0.17895767092704773, "timestamp": "2025-09-02 14:12:03.041783", "step": 415, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:03.094477", "step": 415, "epoch": 1 }, { "type": "loss", "content": 0.08916999399662018, "timestamp": "2025-09-02 14:12:03.100552", "step": 416, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:03.152565", "step": 416, "epoch": 1 }, { "type": "loss", "content": 0.2093767672777176, "timestamp": "2025-09-02 14:12:03.154565", "step": 417, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:03.207065", "step": 417, "epoch": 1 }, { "type": "loss", "content": 0.1329169124364853, "timestamp": "2025-09-02 14:12:03.209377", "step": 418, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:03.261863", "step": 418, "epoch": 1 }, { "type": "loss", "content": 0.2013414204120636, "timestamp": "2025-09-02 14:12:03.264020", "step": 419, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:03.315937", "step": 419, "epoch": 1 }, { "type": "loss", "content": 0.30560269951820374, "timestamp": "2025-09-02 14:12:03.321813", "step": 420, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:12:05.194397", "step": 420, "epoch": 1 }, { "type": "pplx", "content": 7934.276808824245, "timestamp": "2025-09-02 14:12:05.196818", "step": 420, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:05.249445", "step": 420, "epoch": 1 }, { "type": "loss", "content": 0.14219453930854797, "timestamp": "2025-09-02 14:12:05.251841", "step": 421, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:05.304389", "step": 421, "epoch": 1 }, { "type": "loss", "content": 0.2583293318748474, "timestamp": "2025-09-02 14:12:05.306704", "step": 422, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:05.359128", "step": 422, "epoch": 1 }, { "type": "loss", "content": 0.1615259349346161, "timestamp": "2025-09-02 14:12:05.361696", "step": 423, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:05.414270", "step": 423, "epoch": 1 }, { "type": "loss", "content": 0.22063878178596497, "timestamp": "2025-09-02 14:12:05.420773", "step": 424, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:05.472693", "step": 424, "epoch": 1 }, { "type": "loss", "content": 0.16630089282989502, "timestamp": "2025-09-02 14:12:05.475074", "step": 425, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:05.528168", "step": 425, "epoch": 1 }, { "type": "loss", "content": 0.2939158082008362, "timestamp": "2025-09-02 14:12:05.530386", "step": 426, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:05.583312", "step": 426, "epoch": 1 }, { "type": "loss", "content": 0.20497268438339233, "timestamp": "2025-09-02 14:12:05.585718", "step": 427, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:05.638411", "step": 427, "epoch": 1 }, { "type": "loss", "content": 0.22515158355236053, "timestamp": "2025-09-02 14:12:05.644288", "step": 428, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:05.696482", "step": 428, "epoch": 1 }, { "type": "loss", "content": 0.11470671743154526, "timestamp": "2025-09-02 14:12:05.698822", "step": 429, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:05.751550", "step": 429, "epoch": 1 }, { "type": "loss", "content": 0.2509019076824188, "timestamp": "2025-09-02 14:12:05.753875", "step": 430, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:05.806629", "step": 430, "epoch": 1 }, { "type": "loss", "content": 0.30088552832603455, "timestamp": "2025-09-02 14:12:05.809320", "step": 431, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:05.862150", "step": 431, "epoch": 1 }, { "type": "loss", "content": 0.20051656663417816, "timestamp": "2025-09-02 14:12:05.868043", "step": 432, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:05.919513", "step": 432, "epoch": 1 }, { "type": "loss", "content": 0.20523899793624878, "timestamp": "2025-09-02 14:12:05.921821", "step": 433, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:05.974545", "step": 433, "epoch": 1 }, { "type": "loss", "content": 0.14520666003227234, "timestamp": "2025-09-02 14:12:05.977077", "step": 434, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:06.029688", "step": 434, "epoch": 1 }, { "type": "loss", "content": 0.19668111205101013, "timestamp": "2025-09-02 14:12:06.032478", "step": 435, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:06.085825", "step": 435, "epoch": 1 }, { "type": "loss", "content": 0.20132692158222198, "timestamp": "2025-09-02 14:12:06.091614", "step": 436, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:06.143535", "step": 436, "epoch": 1 }, { "type": "loss", "content": 0.2703307569026947, "timestamp": "2025-09-02 14:12:06.145847", "step": 437, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:06.199296", "step": 437, "epoch": 1 }, { "type": "loss", "content": 0.11128479242324829, "timestamp": "2025-09-02 14:12:06.201330", "step": 438, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:06.253874", "step": 438, "epoch": 1 }, { "type": "loss", "content": 0.15917548537254333, "timestamp": "2025-09-02 14:12:06.257367", "step": 439, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:06.309753", "step": 439, "epoch": 1 }, { "type": "loss", "content": 0.10931098461151123, "timestamp": "2025-09-02 14:12:06.315902", "step": 440, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:12:08.176681", "step": 440, "epoch": 1 }, { "type": "pplx", "content": 8078.855748072446, "timestamp": "2025-09-02 14:12:08.178928", "step": 440, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 440", "timestamp": "2025-09-02 14:12:08.539822", "step": 440, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:08.595744", "step": 440, "epoch": 1 }, { "type": "loss", "content": 0.30737125873565674, "timestamp": "2025-09-02 14:12:08.597982", "step": 441, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:08.651041", "step": 441, "epoch": 1 }, { "type": "loss", "content": 0.29249483346939087, "timestamp": "2025-09-02 14:12:08.653117", "step": 442, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:08.706197", "step": 442, "epoch": 1 }, { "type": "loss", "content": 0.19619552791118622, "timestamp": "2025-09-02 14:12:08.708461", "step": 443, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:08.761106", "step": 443, "epoch": 1 }, { "type": "loss", "content": 0.16840049624443054, "timestamp": "2025-09-02 14:12:08.767223", "step": 444, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:08.819662", "step": 444, "epoch": 1 }, { "type": "loss", "content": 0.148340106010437, "timestamp": "2025-09-02 14:12:08.821876", "step": 445, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:08.875036", "step": 445, "epoch": 1 }, { "type": "loss", "content": 0.20711186528205872, "timestamp": "2025-09-02 14:12:08.877501", "step": 446, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:08.931530", "step": 446, "epoch": 1 }, { "type": "loss", "content": 0.1976381242275238, "timestamp": "2025-09-02 14:12:08.933823", "step": 447, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:08.986526", "step": 447, "epoch": 1 }, { "type": "loss", "content": 0.19532735645771027, "timestamp": "2025-09-02 14:12:08.992308", "step": 448, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:09.044533", "step": 448, "epoch": 1 }, { "type": "loss", "content": 0.304631769657135, "timestamp": "2025-09-02 14:12:09.046750", "step": 449, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:09.099596", "step": 449, "epoch": 1 }, { "type": "loss", "content": 0.2371954619884491, "timestamp": "2025-09-02 14:12:09.101858", "step": 450, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:09.154764", "step": 450, "epoch": 1 }, { "type": "loss", "content": 0.25953948497772217, "timestamp": "2025-09-02 14:12:09.157110", "step": 451, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:09.209763", "step": 451, "epoch": 1 }, { "type": "loss", "content": 0.16590219736099243, "timestamp": "2025-09-02 14:12:09.215904", "step": 452, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-02 14:12:09.277243", "step": 452, "epoch": 1 }, { "type": "loss", "content": 0.18301482498645782, "timestamp": "2025-09-02 14:12:09.279599", "step": 453, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:09.333691", "step": 453, "epoch": 1 }, { "type": "loss", "content": 0.19958895444869995, "timestamp": "2025-09-02 14:12:09.336014", "step": 454, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:09.388983", "step": 454, "epoch": 1 }, { "type": "loss", "content": 0.2948741614818573, "timestamp": "2025-09-02 14:12:09.391038", "step": 455, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:09.443267", "step": 455, "epoch": 1 }, { "type": "loss", "content": 0.5643308162689209, "timestamp": "2025-09-02 14:12:09.449326", "step": 456, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:09.501597", "step": 456, "epoch": 1 }, { "type": "loss", "content": 0.2348475307226181, "timestamp": "2025-09-02 14:12:09.503848", "step": 457, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:09.556692", "step": 457, "epoch": 1 }, { "type": "loss", "content": 0.09766942262649536, "timestamp": "2025-09-02 14:12:09.558904", "step": 458, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:09.612436", "step": 458, "epoch": 1 }, { "type": "loss", "content": 0.18156063556671143, "timestamp": "2025-09-02 14:12:09.614391", "step": 459, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:09.667118", "step": 459, "epoch": 1 }, { "type": "loss", "content": 0.20348617434501648, "timestamp": "2025-09-02 14:12:09.672954", "step": 460, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:12:11.524040", "step": 460, "epoch": 1 }, { "type": "pplx", "content": 8084.41091734419, "timestamp": "2025-09-02 14:12:11.536100", "step": 460, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:11.594325", "step": 460, "epoch": 1 }, { "type": "loss", "content": 0.126946359872818, "timestamp": "2025-09-02 14:12:11.596469", "step": 461, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:11.650080", "step": 461, "epoch": 1 }, { "type": "loss", "content": 0.11848227679729462, "timestamp": "2025-09-02 14:12:11.652496", "step": 462, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:11.705062", "step": 462, "epoch": 1 }, { "type": "loss", "content": 0.30987879633903503, "timestamp": "2025-09-02 14:12:11.707260", "step": 463, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:11.760754", "step": 463, "epoch": 1 }, { "type": "loss", "content": 0.09394294768571854, "timestamp": "2025-09-02 14:12:11.766783", "step": 464, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:11.818481", "step": 464, "epoch": 1 }, { "type": "loss", "content": 0.13483382761478424, "timestamp": "2025-09-02 14:12:11.820740", "step": 465, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:11.873541", "step": 465, "epoch": 1 }, { "type": "loss", "content": 0.11192884296178818, "timestamp": "2025-09-02 14:12:11.875427", "step": 466, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:11.928392", "step": 466, "epoch": 1 }, { "type": "loss", "content": 0.325011670589447, "timestamp": "2025-09-02 14:12:11.930578", "step": 467, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:11.983351", "step": 467, "epoch": 1 }, { "type": "loss", "content": 0.11741884797811508, "timestamp": "2025-09-02 14:12:11.989353", "step": 468, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:12.042367", "step": 468, "epoch": 1 }, { "type": "loss", "content": 0.1551104187965393, "timestamp": "2025-09-02 14:12:12.044698", "step": 469, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:12.099044", "step": 469, "epoch": 1 }, { "type": "loss", "content": 0.18945662677288055, "timestamp": "2025-09-02 14:12:12.101208", "step": 470, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:12.155368", "step": 470, "epoch": 1 }, { "type": "loss", "content": 0.27370786666870117, "timestamp": "2025-09-02 14:12:12.157699", "step": 471, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:12.211101", "step": 471, "epoch": 1 }, { "type": "loss", "content": 0.14143025875091553, "timestamp": "2025-09-02 14:12:12.217109", "step": 472, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:12.269122", "step": 472, "epoch": 1 }, { "type": "loss", "content": 0.1504458785057068, "timestamp": "2025-09-02 14:12:12.271423", "step": 473, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:12.323947", "step": 473, "epoch": 1 }, { "type": "loss", "content": 0.14726395905017853, "timestamp": "2025-09-02 14:12:12.326324", "step": 474, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:12.379157", "step": 474, "epoch": 1 }, { "type": "loss", "content": 0.06388150900602341, "timestamp": "2025-09-02 14:12:12.381337", "step": 475, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:12.433564", "step": 475, "epoch": 1 }, { "type": "loss", "content": 0.31098493933677673, "timestamp": "2025-09-02 14:12:12.439988", "step": 476, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:12.492591", "step": 476, "epoch": 1 }, { "type": "loss", "content": 0.10325179249048233, "timestamp": "2025-09-02 14:12:12.494721", "step": 477, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:12.547353", "step": 477, "epoch": 1 }, { "type": "loss", "content": 0.29745379090309143, "timestamp": "2025-09-02 14:12:12.549721", "step": 478, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:12.602145", "step": 478, "epoch": 1 }, { "type": "loss", "content": 0.17632707953453064, "timestamp": "2025-09-02 14:12:12.604488", "step": 479, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:12.656600", "step": 479, "epoch": 1 }, { "type": "loss", "content": 0.17138060927391052, "timestamp": "2025-09-02 14:12:12.662515", "step": 480, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:12:14.524152", "step": 480, "epoch": 1 }, { "type": "pplx", "content": 8101.271656174691, "timestamp": "2025-09-02 14:12:14.526304", "step": 480, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 480", "timestamp": "2025-09-02 14:12:14.895860", "step": 480, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:14.953337", "step": 480, "epoch": 1 }, { "type": "loss", "content": 0.1618996113538742, "timestamp": "2025-09-02 14:12:14.955655", "step": 481, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:15.010624", "step": 481, "epoch": 1 }, { "type": "loss", "content": 0.26317426562309265, "timestamp": "2025-09-02 14:12:15.012898", "step": 482, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:15.065797", "step": 482, "epoch": 1 }, { "type": "loss", "content": 0.11772159487009048, "timestamp": "2025-09-02 14:12:15.069449", "step": 483, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:15.121829", "step": 483, "epoch": 1 }, { "type": "loss", "content": 0.1445346176624298, "timestamp": "2025-09-02 14:12:15.128644", "step": 484, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:15.181080", "step": 484, "epoch": 1 }, { "type": "loss", "content": 0.19072090089321136, "timestamp": "2025-09-02 14:12:15.183257", "step": 485, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:15.235984", "step": 485, "epoch": 1 }, { "type": "loss", "content": 0.2421288639307022, "timestamp": "2025-09-02 14:12:15.237863", "step": 486, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:15.290702", "step": 486, "epoch": 1 }, { "type": "loss", "content": 0.29391947388648987, "timestamp": "2025-09-02 14:12:15.292982", "step": 487, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:15.346353", "step": 487, "epoch": 1 }, { "type": "loss", "content": 0.29985833168029785, "timestamp": "2025-09-02 14:12:15.352780", "step": 488, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:15.405394", "step": 488, "epoch": 1 }, { "type": "loss", "content": 0.15421190857887268, "timestamp": "2025-09-02 14:12:15.407760", "step": 489, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:15.461225", "step": 489, "epoch": 1 }, { "type": "loss", "content": 0.1338934600353241, "timestamp": "2025-09-02 14:12:15.463994", "step": 490, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:15.518561", "step": 490, "epoch": 1 }, { "type": "loss", "content": 0.19187965989112854, "timestamp": "2025-09-02 14:12:15.520973", "step": 491, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:15.573560", "step": 491, "epoch": 1 }, { "type": "loss", "content": 0.2645076513290405, "timestamp": "2025-09-02 14:12:15.580690", "step": 492, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:15.633587", "step": 492, "epoch": 1 }, { "type": "loss", "content": 0.16731980443000793, "timestamp": "2025-09-02 14:12:15.636206", "step": 493, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:15.688845", "step": 493, "epoch": 1 }, { "type": "loss", "content": 0.12692071497440338, "timestamp": "2025-09-02 14:12:15.691014", "step": 494, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:15.744482", "step": 494, "epoch": 1 }, { "type": "loss", "content": 0.25775790214538574, "timestamp": "2025-09-02 14:12:15.746715", "step": 495, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:15.800262", "step": 495, "epoch": 1 }, { "type": "loss", "content": 0.19151270389556885, "timestamp": "2025-09-02 14:12:15.806320", "step": 496, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:15.859059", "step": 496, "epoch": 1 }, { "type": "loss", "content": 0.20017877221107483, "timestamp": "2025-09-02 14:12:15.861526", "step": 497, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:15.914513", "step": 497, "epoch": 1 }, { "type": "loss", "content": 0.23995327949523926, "timestamp": "2025-09-02 14:12:15.916937", "step": 498, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:15.969777", "step": 498, "epoch": 1 }, { "type": "loss", "content": 0.15123377740383148, "timestamp": "2025-09-02 14:12:15.971825", "step": 499, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:16.025434", "step": 499, "epoch": 1 }, { "type": "loss", "content": 0.2253989428281784, "timestamp": "2025-09-02 14:12:16.031430", "step": 500, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:12:17.891685", "step": 500, "epoch": 1 }, { "type": "pplx", "content": 8109.551240634742, "timestamp": "2025-09-02 14:12:17.895726", "step": 500, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:17.949772", "step": 500, "epoch": 1 }, { "type": "loss", "content": 0.19867892563343048, "timestamp": "2025-09-02 14:12:17.952152", "step": 501, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:18.005676", "step": 501, "epoch": 1 }, { "type": "loss", "content": 0.4105687141418457, "timestamp": "2025-09-02 14:12:18.008326", "step": 502, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:18.061115", "step": 502, "epoch": 1 }, { "type": "loss", "content": 0.263882577419281, "timestamp": "2025-09-02 14:12:18.063384", "step": 503, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:18.116119", "step": 503, "epoch": 1 }, { "type": "loss", "content": 0.1686132848262787, "timestamp": "2025-09-02 14:12:18.122163", "step": 504, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:18.173547", "step": 504, "epoch": 1 }, { "type": "loss", "content": 0.26908907294273376, "timestamp": "2025-09-02 14:12:18.175724", "step": 505, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:18.228113", "step": 505, "epoch": 1 }, { "type": "loss", "content": 0.266213595867157, "timestamp": "2025-09-02 14:12:18.230318", "step": 506, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:18.282605", "step": 506, "epoch": 1 }, { "type": "loss", "content": 0.15929871797561646, "timestamp": "2025-09-02 14:12:18.284500", "step": 507, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:18.336786", "step": 507, "epoch": 1 }, { "type": "loss", "content": 0.41836681962013245, "timestamp": "2025-09-02 14:12:18.342944", "step": 508, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:18.395307", "step": 508, "epoch": 1 }, { "type": "loss", "content": 0.37175512313842773, "timestamp": "2025-09-02 14:12:18.397510", "step": 509, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:18.450134", "step": 509, "epoch": 1 }, { "type": "loss", "content": 0.2951255738735199, "timestamp": "2025-09-02 14:12:18.452326", "step": 510, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:18.504880", "step": 510, "epoch": 1 }, { "type": "loss", "content": 0.23821453750133514, "timestamp": "2025-09-02 14:12:18.507043", "step": 511, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:18.560187", "step": 511, "epoch": 1 }, { "type": "loss", "content": 0.16826188564300537, "timestamp": "2025-09-02 14:12:18.566225", "step": 512, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:18.618574", "step": 512, "epoch": 1 }, { "type": "loss", "content": 0.1585489958524704, "timestamp": "2025-09-02 14:12:18.620697", "step": 513, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:18.673311", "step": 513, "epoch": 1 }, { "type": "loss", "content": 0.23239484429359436, "timestamp": "2025-09-02 14:12:18.675731", "step": 514, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:18.728467", "step": 514, "epoch": 1 }, { "type": "loss", "content": 0.24080470204353333, "timestamp": "2025-09-02 14:12:18.730575", "step": 515, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:18.782806", "step": 515, "epoch": 1 }, { "type": "loss", "content": 0.12067770212888718, "timestamp": "2025-09-02 14:12:18.788864", "step": 516, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:18.841125", "step": 516, "epoch": 1 }, { "type": "loss", "content": 0.2397547960281372, "timestamp": "2025-09-02 14:12:18.843387", "step": 517, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:18.895625", "step": 517, "epoch": 1 }, { "type": "loss", "content": 0.25557515025138855, "timestamp": "2025-09-02 14:12:18.897706", "step": 518, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:18.949867", "step": 518, "epoch": 1 }, { "type": "loss", "content": 0.1338682919740677, "timestamp": "2025-09-02 14:12:18.951920", "step": 519, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:19.005048", "step": 519, "epoch": 1 }, { "type": "loss", "content": 0.24343588948249817, "timestamp": "2025-09-02 14:12:19.010785", "step": 520, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:12:20.858827", "step": 520, "epoch": 1 }, { "type": "pplx", "content": 8056.258359741721, "timestamp": "2025-09-02 14:12:20.861204", "step": 520, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 520", "timestamp": "2025-09-02 14:12:21.217555", "step": 520, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:21.271186", "step": 520, "epoch": 1 }, { "type": "loss", "content": 0.21524709463119507, "timestamp": "2025-09-02 14:12:21.273808", "step": 521, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:21.327666", "step": 521, "epoch": 1 }, { "type": "loss", "content": 0.24037283658981323, "timestamp": "2025-09-02 14:12:21.329865", "step": 522, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:21.382398", "step": 522, "epoch": 1 }, { "type": "loss", "content": 0.17394421994686127, "timestamp": "2025-09-02 14:12:21.384521", "step": 523, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:21.437276", "step": 523, "epoch": 1 }, { "type": "loss", "content": 0.11603036522865295, "timestamp": "2025-09-02 14:12:21.443249", "step": 524, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:21.495270", "step": 524, "epoch": 1 }, { "type": "loss", "content": 0.20078644156455994, "timestamp": "2025-09-02 14:12:21.497493", "step": 525, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:21.550234", "step": 525, "epoch": 1 }, { "type": "loss", "content": 0.275970458984375, "timestamp": "2025-09-02 14:12:21.553171", "step": 526, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:21.606251", "step": 526, "epoch": 1 }, { "type": "loss", "content": 0.31593137979507446, "timestamp": "2025-09-02 14:12:21.608718", "step": 527, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:21.661525", "step": 527, "epoch": 1 }, { "type": "loss", "content": 0.22388286888599396, "timestamp": "2025-09-02 14:12:21.667203", "step": 528, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:21.719238", "step": 528, "epoch": 1 }, { "type": "loss", "content": 0.14003543555736542, "timestamp": "2025-09-02 14:12:21.721747", "step": 529, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:21.776780", "step": 529, "epoch": 1 }, { "type": "loss", "content": 0.1461782455444336, "timestamp": "2025-09-02 14:12:21.778861", "step": 530, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:21.831777", "step": 530, "epoch": 1 }, { "type": "loss", "content": 0.2122151255607605, "timestamp": "2025-09-02 14:12:21.834173", "step": 531, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:21.887433", "step": 531, "epoch": 1 }, { "type": "loss", "content": 0.23945625126361847, "timestamp": "2025-09-02 14:12:21.893725", "step": 532, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:21.945688", "step": 532, "epoch": 1 }, { "type": "loss", "content": 0.24976438283920288, "timestamp": "2025-09-02 14:12:21.947899", "step": 533, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:22.001686", "step": 533, "epoch": 1 }, { "type": "loss", "content": 0.12598933279514313, "timestamp": "2025-09-02 14:12:22.003787", "step": 534, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:22.056201", "step": 534, "epoch": 1 }, { "type": "loss", "content": 0.10549245774745941, "timestamp": "2025-09-02 14:12:22.059427", "step": 535, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:22.111744", "step": 535, "epoch": 1 }, { "type": "loss", "content": 0.22029969096183777, "timestamp": "2025-09-02 14:12:22.118510", "step": 536, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:22.170286", "step": 536, "epoch": 1 }, { "type": "loss", "content": 0.2900339365005493, "timestamp": "2025-09-02 14:12:22.172511", "step": 537, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:22.226563", "step": 537, "epoch": 1 }, { "type": "loss", "content": 0.1720549315214157, "timestamp": "2025-09-02 14:12:22.228923", "step": 538, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:22.281387", "step": 538, "epoch": 1 }, { "type": "loss", "content": 0.27897489070892334, "timestamp": "2025-09-02 14:12:22.283559", "step": 539, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:22.336512", "step": 539, "epoch": 1 }, { "type": "loss", "content": 0.16984623670578003, "timestamp": "2025-09-02 14:12:22.342092", "step": 540, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:12:24.186812", "step": 540, "epoch": 1 }, { "type": "pplx", "content": 8037.772364531754, "timestamp": "2025-09-02 14:12:24.188918", "step": 540, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:24.240913", "step": 540, "epoch": 1 }, { "type": "loss", "content": 0.2469266802072525, "timestamp": "2025-09-02 14:12:24.242823", "step": 541, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:24.295666", "step": 541, "epoch": 1 }, { "type": "loss", "content": 0.2879405915737152, "timestamp": "2025-09-02 14:12:24.297973", "step": 542, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:24.350651", "step": 542, "epoch": 1 }, { "type": "loss", "content": 0.20818738639354706, "timestamp": "2025-09-02 14:12:24.352876", "step": 543, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:24.405626", "step": 543, "epoch": 1 }, { "type": "loss", "content": 0.1932748258113861, "timestamp": "2025-09-02 14:12:24.411758", "step": 544, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:24.463731", "step": 544, "epoch": 1 }, { "type": "loss", "content": 0.15431034564971924, "timestamp": "2025-09-02 14:12:24.466006", "step": 545, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:24.519119", "step": 545, "epoch": 1 }, { "type": "loss", "content": 0.21393738687038422, "timestamp": "2025-09-02 14:12:24.521270", "step": 546, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:24.574333", "step": 546, "epoch": 1 }, { "type": "loss", "content": 0.2285403609275818, "timestamp": "2025-09-02 14:12:24.576622", "step": 547, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:24.629406", "step": 547, "epoch": 1 }, { "type": "loss", "content": 0.17590785026550293, "timestamp": "2025-09-02 14:12:24.635459", "step": 548, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:24.687481", "step": 548, "epoch": 1 }, { "type": "loss", "content": 0.13367512822151184, "timestamp": "2025-09-02 14:12:24.691138", "step": 549, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:24.742962", "step": 549, "epoch": 1 }, { "type": "loss", "content": 0.1299186497926712, "timestamp": "2025-09-02 14:12:24.745300", "step": 550, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:24.798753", "step": 550, "epoch": 1 }, { "type": "loss", "content": 0.2013091742992401, "timestamp": "2025-09-02 14:12:24.800991", "step": 551, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:24.853839", "step": 551, "epoch": 1 }, { "type": "loss", "content": 0.19988110661506653, "timestamp": "2025-09-02 14:12:24.859802", "step": 552, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:24.911584", "step": 552, "epoch": 1 }, { "type": "loss", "content": 0.15382106602191925, "timestamp": "2025-09-02 14:12:24.913938", "step": 553, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:24.966132", "step": 553, "epoch": 1 }, { "type": "loss", "content": 0.21833764016628265, "timestamp": "2025-09-02 14:12:24.968320", "step": 554, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:25.021031", "step": 554, "epoch": 1 }, { "type": "loss", "content": 0.21603110432624817, "timestamp": "2025-09-02 14:12:25.023272", "step": 555, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:25.076060", "step": 555, "epoch": 1 }, { "type": "loss", "content": 0.2347061187028885, "timestamp": "2025-09-02 14:12:25.082087", "step": 556, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:25.134030", "step": 556, "epoch": 1 }, { "type": "loss", "content": 0.15172335505485535, "timestamp": "2025-09-02 14:12:25.136334", "step": 557, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:25.190297", "step": 557, "epoch": 1 }, { "type": "loss", "content": 0.10517603904008865, "timestamp": "2025-09-02 14:12:25.193335", "step": 558, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:25.246379", "step": 558, "epoch": 1 }, { "type": "loss", "content": 0.1435347944498062, "timestamp": "2025-09-02 14:12:25.248573", "step": 559, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:25.301427", "step": 559, "epoch": 1 }, { "type": "loss", "content": 0.30008888244628906, "timestamp": "2025-09-02 14:12:25.307167", "step": 560, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:12:27.165794", "step": 560, "epoch": 1 }, { "type": "pplx", "content": 8025.974011259475, "timestamp": "2025-09-02 14:12:27.168102", "step": 560, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 560", "timestamp": "2025-09-02 14:12:27.550060", "step": 560, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:27.604553", "step": 560, "epoch": 1 }, { "type": "loss", "content": 0.37707850337028503, "timestamp": "2025-09-02 14:12:27.606822", "step": 561, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:27.661098", "step": 561, "epoch": 1 }, { "type": "loss", "content": 0.18437258899211884, "timestamp": "2025-09-02 14:12:27.663278", "step": 562, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:27.716449", "step": 562, "epoch": 1 }, { "type": "loss", "content": 0.39019161462783813, "timestamp": "2025-09-02 14:12:27.719204", "step": 563, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:27.773276", "step": 563, "epoch": 1 }, { "type": "loss", "content": 0.1949104368686676, "timestamp": "2025-09-02 14:12:27.779564", "step": 564, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:27.831716", "step": 564, "epoch": 1 }, { "type": "loss", "content": 0.20049934089183807, "timestamp": "2025-09-02 14:12:27.833653", "step": 565, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:27.885787", "step": 565, "epoch": 1 }, { "type": "loss", "content": 0.21797725558280945, "timestamp": "2025-09-02 14:12:27.888024", "step": 566, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:27.940820", "step": 566, "epoch": 1 }, { "type": "loss", "content": 0.08693860471248627, "timestamp": "2025-09-02 14:12:27.943185", "step": 567, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:27.995780", "step": 567, "epoch": 1 }, { "type": "loss", "content": 0.12672528624534607, "timestamp": "2025-09-02 14:12:28.001701", "step": 568, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:28.053797", "step": 568, "epoch": 1 }, { "type": "loss", "content": 0.3194925785064697, "timestamp": "2025-09-02 14:12:28.056073", "step": 569, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:28.109165", "step": 569, "epoch": 1 }, { "type": "loss", "content": 0.20399391651153564, "timestamp": "2025-09-02 14:12:28.111343", "step": 570, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:28.163885", "step": 570, "epoch": 1 }, { "type": "loss", "content": 0.2891095280647278, "timestamp": "2025-09-02 14:12:28.166407", "step": 571, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:28.219425", "step": 571, "epoch": 1 }, { "type": "loss", "content": 0.22083020210266113, "timestamp": "2025-09-02 14:12:28.225217", "step": 572, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:28.278476", "step": 572, "epoch": 1 }, { "type": "loss", "content": 0.17427320778369904, "timestamp": "2025-09-02 14:12:28.280418", "step": 573, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:28.333063", "step": 573, "epoch": 1 }, { "type": "loss", "content": 0.10472602397203445, "timestamp": "2025-09-02 14:12:28.335145", "step": 574, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:28.388174", "step": 574, "epoch": 1 }, { "type": "loss", "content": 0.23935921490192413, "timestamp": "2025-09-02 14:12:28.390760", "step": 575, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:28.444025", "step": 575, "epoch": 1 }, { "type": "loss", "content": 0.12245737761259079, "timestamp": "2025-09-02 14:12:28.450049", "step": 576, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:28.502755", "step": 576, "epoch": 1 }, { "type": "loss", "content": 0.3243488371372223, "timestamp": "2025-09-02 14:12:28.504964", "step": 577, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:28.559127", "step": 577, "epoch": 1 }, { "type": "loss", "content": 0.21100397408008575, "timestamp": "2025-09-02 14:12:28.561382", "step": 578, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:28.614370", "step": 578, "epoch": 1 }, { "type": "loss", "content": 0.1530246138572693, "timestamp": "2025-09-02 14:12:28.616706", "step": 579, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:28.669468", "step": 579, "epoch": 1 }, { "type": "loss", "content": 0.39402878284454346, "timestamp": "2025-09-02 14:12:28.675487", "step": 580, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:12:30.539084", "step": 580, "epoch": 1 }, { "type": "pplx", "content": 8016.311832381298, "timestamp": "2025-09-02 14:12:30.541150", "step": 580, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:30.592808", "step": 580, "epoch": 1 }, { "type": "loss", "content": 0.19938622415065765, "timestamp": "2025-09-02 14:12:30.595533", "step": 581, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:30.648806", "step": 581, "epoch": 1 }, { "type": "loss", "content": 0.19479770958423615, "timestamp": "2025-09-02 14:12:30.650983", "step": 582, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:30.703753", "step": 582, "epoch": 1 }, { "type": "loss", "content": 0.288784384727478, "timestamp": "2025-09-02 14:12:30.706039", "step": 583, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:30.758682", "step": 583, "epoch": 1 }, { "type": "loss", "content": 0.23932312428951263, "timestamp": "2025-09-02 14:12:30.764720", "step": 584, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:30.816923", "step": 584, "epoch": 1 }, { "type": "loss", "content": 0.16811004281044006, "timestamp": "2025-09-02 14:12:30.819602", "step": 585, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:30.872593", "step": 585, "epoch": 1 }, { "type": "loss", "content": 0.1545003056526184, "timestamp": "2025-09-02 14:12:30.874508", "step": 586, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:30.927163", "step": 586, "epoch": 1 }, { "type": "loss", "content": 0.21501733362674713, "timestamp": "2025-09-02 14:12:30.929511", "step": 587, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:30.982301", "step": 587, "epoch": 1 }, { "type": "loss", "content": 0.15239772200584412, "timestamp": "2025-09-02 14:12:30.988445", "step": 588, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:31.040631", "step": 588, "epoch": 1 }, { "type": "loss", "content": 0.4231335520744324, "timestamp": "2025-09-02 14:12:31.042770", "step": 589, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:31.094975", "step": 589, "epoch": 1 }, { "type": "loss", "content": 0.2375190258026123, "timestamp": "2025-09-02 14:12:31.097317", "step": 590, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:31.149496", "step": 590, "epoch": 1 }, { "type": "loss", "content": 0.11529960483312607, "timestamp": "2025-09-02 14:12:31.151830", "step": 591, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:31.205040", "step": 591, "epoch": 1 }, { "type": "loss", "content": 0.14006307721138, "timestamp": "2025-09-02 14:12:31.210813", "step": 592, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:31.262377", "step": 592, "epoch": 1 }, { "type": "loss", "content": 0.24637471139431, "timestamp": "2025-09-02 14:12:31.264618", "step": 593, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:31.317790", "step": 593, "epoch": 1 }, { "type": "loss", "content": 0.20229710638523102, "timestamp": "2025-09-02 14:12:31.319966", "step": 594, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:31.372230", "step": 594, "epoch": 1 }, { "type": "loss", "content": 0.37702369689941406, "timestamp": "2025-09-02 14:12:31.376107", "step": 595, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:31.429830", "step": 595, "epoch": 1 }, { "type": "loss", "content": 0.24962574243545532, "timestamp": "2025-09-02 14:12:31.435864", "step": 596, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:31.488004", "step": 596, "epoch": 1 }, { "type": "loss", "content": 0.16404475271701813, "timestamp": "2025-09-02 14:12:31.490328", "step": 597, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:31.543227", "step": 597, "epoch": 1 }, { "type": "loss", "content": 0.14978386461734772, "timestamp": "2025-09-02 14:12:31.545769", "step": 598, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:31.598966", "step": 598, "epoch": 1 }, { "type": "loss", "content": 0.14341861009597778, "timestamp": "2025-09-02 14:12:31.601313", "step": 599, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:31.654794", "step": 599, "epoch": 1 }, { "type": "loss", "content": 0.2877383530139923, "timestamp": "2025-09-02 14:12:31.660532", "step": 600, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:12:33.522409", "step": 600, "epoch": 1 }, { "type": "pplx", "content": 7968.813598605782, "timestamp": "2025-09-02 14:12:33.525211", "step": 600, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 600", "timestamp": "2025-09-02 14:12:33.954903", "step": 600, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:34.010566", "step": 600, "epoch": 1 }, { "type": "loss", "content": 0.18870899081230164, "timestamp": "2025-09-02 14:12:34.012933", "step": 601, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:34.066909", "step": 601, "epoch": 1 }, { "type": "loss", "content": 0.16435538232326508, "timestamp": "2025-09-02 14:12:34.068966", "step": 602, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:34.121807", "step": 602, "epoch": 1 }, { "type": "loss", "content": 0.11309024691581726, "timestamp": "2025-09-02 14:12:34.124206", "step": 603, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:34.176818", "step": 603, "epoch": 1 }, { "type": "loss", "content": 0.13336122035980225, "timestamp": "2025-09-02 14:12:34.183048", "step": 604, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:34.234908", "step": 604, "epoch": 1 }, { "type": "loss", "content": 0.21555042266845703, "timestamp": "2025-09-02 14:12:34.237093", "step": 605, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:34.289615", "step": 605, "epoch": 1 }, { "type": "loss", "content": 0.3345740735530853, "timestamp": "2025-09-02 14:12:34.291588", "step": 606, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:34.343705", "step": 606, "epoch": 1 }, { "type": "loss", "content": 0.1708202064037323, "timestamp": "2025-09-02 14:12:34.345741", "step": 607, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:34.398071", "step": 607, "epoch": 1 }, { "type": "loss", "content": 0.17423295974731445, "timestamp": "2025-09-02 14:12:34.404068", "step": 608, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:34.457901", "step": 608, "epoch": 1 }, { "type": "loss", "content": 0.13439247012138367, "timestamp": "2025-09-02 14:12:34.460519", "step": 609, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:34.513561", "step": 609, "epoch": 1 }, { "type": "loss", "content": 0.19716106355190277, "timestamp": "2025-09-02 14:12:34.517155", "step": 610, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:34.571682", "step": 610, "epoch": 1 }, { "type": "loss", "content": 0.13293036818504333, "timestamp": "2025-09-02 14:12:34.574360", "step": 611, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:34.628890", "step": 611, "epoch": 1 }, { "type": "loss", "content": 0.18210354447364807, "timestamp": "2025-09-02 14:12:34.634948", "step": 612, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:34.687638", "step": 612, "epoch": 1 }, { "type": "loss", "content": 0.12907618284225464, "timestamp": "2025-09-02 14:12:34.690031", "step": 613, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:34.742561", "step": 613, "epoch": 1 }, { "type": "loss", "content": 0.23830890655517578, "timestamp": "2025-09-02 14:12:34.744882", "step": 614, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:34.797493", "step": 614, "epoch": 1 }, { "type": "loss", "content": 0.26929014921188354, "timestamp": "2025-09-02 14:12:34.799594", "step": 615, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:34.852254", "step": 615, "epoch": 1 }, { "type": "loss", "content": 0.26516860723495483, "timestamp": "2025-09-02 14:12:34.858292", "step": 616, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:34.910525", "step": 616, "epoch": 1 }, { "type": "loss", "content": 0.1559997946023941, "timestamp": "2025-09-02 14:12:34.912588", "step": 617, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:34.966847", "step": 617, "epoch": 1 }, { "type": "loss", "content": 0.09411335736513138, "timestamp": "2025-09-02 14:12:34.969067", "step": 618, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:35.021762", "step": 618, "epoch": 1 }, { "type": "loss", "content": 0.1357458233833313, "timestamp": "2025-09-02 14:12:35.023939", "step": 619, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:35.076939", "step": 619, "epoch": 1 }, { "type": "loss", "content": 0.24448956549167633, "timestamp": "2025-09-02 14:12:35.082410", "step": 620, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:12:36.931353", "step": 620, "epoch": 1 }, { "type": "pplx", "content": 7903.616993049431, "timestamp": "2025-09-02 14:12:36.933590", "step": 620, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:36.985836", "step": 620, "epoch": 1 }, { "type": "loss", "content": 0.17825831472873688, "timestamp": "2025-09-02 14:12:36.987711", "step": 621, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:37.040134", "step": 621, "epoch": 1 }, { "type": "loss", "content": 0.2853935956954956, "timestamp": "2025-09-02 14:12:37.042463", "step": 622, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:37.094546", "step": 622, "epoch": 1 }, { "type": "loss", "content": 0.2902543544769287, "timestamp": "2025-09-02 14:12:37.098253", "step": 623, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:37.151024", "step": 623, "epoch": 1 }, { "type": "loss", "content": 0.1492205113172531, "timestamp": "2025-09-02 14:12:37.157921", "step": 624, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:37.210134", "step": 624, "epoch": 1 }, { "type": "loss", "content": 0.17582465708255768, "timestamp": "2025-09-02 14:12:37.212695", "step": 625, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:37.265805", "step": 625, "epoch": 1 }, { "type": "loss", "content": 0.3323514461517334, "timestamp": "2025-09-02 14:12:37.268027", "step": 626, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:37.320939", "step": 626, "epoch": 1 }, { "type": "loss", "content": 0.16619913280010223, "timestamp": "2025-09-02 14:12:37.326579", "step": 627, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:37.382933", "step": 627, "epoch": 1 }, { "type": "loss", "content": 0.04043610394001007, "timestamp": "2025-09-02 14:12:37.389836", "step": 628, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:37.442941", "step": 628, "epoch": 1 }, { "type": "loss", "content": 0.3857656419277191, "timestamp": "2025-09-02 14:12:37.445258", "step": 629, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:37.501086", "step": 629, "epoch": 1 }, { "type": "loss", "content": 0.1945725679397583, "timestamp": "2025-09-02 14:12:37.503232", "step": 630, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:37.556339", "step": 630, "epoch": 1 }, { "type": "loss", "content": 0.1152871698141098, "timestamp": "2025-09-02 14:12:37.558606", "step": 631, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:37.611798", "step": 631, "epoch": 1 }, { "type": "loss", "content": 0.19419938325881958, "timestamp": "2025-09-02 14:12:37.617688", "step": 632, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:37.669643", "step": 632, "epoch": 1 }, { "type": "loss", "content": 0.18319827318191528, "timestamp": "2025-09-02 14:12:37.671880", "step": 633, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:37.724384", "step": 633, "epoch": 1 }, { "type": "loss", "content": 0.2837603688240051, "timestamp": "2025-09-02 14:12:37.726622", "step": 634, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:37.779392", "step": 634, "epoch": 1 }, { "type": "loss", "content": 0.18323467671871185, "timestamp": "2025-09-02 14:12:37.781667", "step": 635, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:37.836056", "step": 635, "epoch": 1 }, { "type": "loss", "content": 0.14709478616714478, "timestamp": "2025-09-02 14:12:37.841953", "step": 636, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:37.893653", "step": 636, "epoch": 1 }, { "type": "loss", "content": 0.2110474556684494, "timestamp": "2025-09-02 14:12:37.895952", "step": 637, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:37.948584", "step": 637, "epoch": 1 }, { "type": "loss", "content": 0.31441909074783325, "timestamp": "2025-09-02 14:12:37.950713", "step": 638, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:38.004335", "step": 638, "epoch": 1 }, { "type": "loss", "content": 0.11963314563035965, "timestamp": "2025-09-02 14:12:38.006729", "step": 639, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:38.060020", "step": 639, "epoch": 1 }, { "type": "loss", "content": 0.16965347528457642, "timestamp": "2025-09-02 14:12:38.065806", "step": 640, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:12:39.911009", "step": 640, "epoch": 1 }, { "type": "pplx", "content": 7926.589878277829, "timestamp": "2025-09-02 14:12:39.913231", "step": 640, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 640", "timestamp": "2025-09-02 14:12:40.288222", "step": 640, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:40.344329", "step": 640, "epoch": 1 }, { "type": "loss", "content": 0.11686678230762482, "timestamp": "2025-09-02 14:12:40.346204", "step": 641, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:40.399542", "step": 641, "epoch": 1 }, { "type": "loss", "content": 0.19584155082702637, "timestamp": "2025-09-02 14:12:40.401886", "step": 642, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:40.454820", "step": 642, "epoch": 1 }, { "type": "loss", "content": 0.2713710069656372, "timestamp": "2025-09-02 14:12:40.457216", "step": 643, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:40.511051", "step": 643, "epoch": 1 }, { "type": "loss", "content": 0.14807361364364624, "timestamp": "2025-09-02 14:12:40.517559", "step": 644, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:40.569849", "step": 644, "epoch": 1 }, { "type": "loss", "content": 0.14971205592155457, "timestamp": "2025-09-02 14:12:40.572191", "step": 645, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:40.625496", "step": 645, "epoch": 1 }, { "type": "loss", "content": 0.18060621619224548, "timestamp": "2025-09-02 14:12:40.627936", "step": 646, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:40.680485", "step": 646, "epoch": 1 }, { "type": "loss", "content": 0.2434072494506836, "timestamp": "2025-09-02 14:12:40.682483", "step": 647, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:40.735507", "step": 647, "epoch": 1 }, { "type": "loss", "content": 0.12928201258182526, "timestamp": "2025-09-02 14:12:40.741530", "step": 648, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:40.793447", "step": 648, "epoch": 1 }, { "type": "loss", "content": 0.19865606725215912, "timestamp": "2025-09-02 14:12:40.795797", "step": 649, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:40.849664", "step": 649, "epoch": 1 }, { "type": "loss", "content": 0.2602863907814026, "timestamp": "2025-09-02 14:12:40.852019", "step": 650, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:40.906118", "step": 650, "epoch": 1 }, { "type": "loss", "content": 0.24269771575927734, "timestamp": "2025-09-02 14:12:40.908492", "step": 651, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:40.961423", "step": 651, "epoch": 1 }, { "type": "loss", "content": 0.34464284777641296, "timestamp": "2025-09-02 14:12:40.967399", "step": 652, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:41.019412", "step": 652, "epoch": 1 }, { "type": "loss", "content": 0.14796023070812225, "timestamp": "2025-09-02 14:12:41.021712", "step": 653, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:41.075323", "step": 653, "epoch": 1 }, { "type": "loss", "content": 0.15530647337436676, "timestamp": "2025-09-02 14:12:41.077895", "step": 654, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:41.130752", "step": 654, "epoch": 1 }, { "type": "loss", "content": 0.3560507297515869, "timestamp": "2025-09-02 14:12:41.132721", "step": 655, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:41.185185", "step": 655, "epoch": 1 }, { "type": "loss", "content": 0.21356947720050812, "timestamp": "2025-09-02 14:12:41.191291", "step": 656, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:41.244806", "step": 656, "epoch": 1 }, { "type": "loss", "content": 0.17146934568881989, "timestamp": "2025-09-02 14:12:41.247353", "step": 657, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:41.300253", "step": 657, "epoch": 1 }, { "type": "loss", "content": 0.2701266407966614, "timestamp": "2025-09-02 14:12:41.302593", "step": 658, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:41.355293", "step": 658, "epoch": 1 }, { "type": "loss", "content": 0.16839765012264252, "timestamp": "2025-09-02 14:12:41.357615", "step": 659, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:41.410576", "step": 659, "epoch": 1 }, { "type": "loss", "content": 0.17055469751358032, "timestamp": "2025-09-02 14:12:41.416206", "step": 660, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:12:43.270916", "step": 660, "epoch": 1 }, { "type": "pplx", "content": 7885.159019174716, "timestamp": "2025-09-02 14:12:43.273597", "step": 660, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:43.325765", "step": 660, "epoch": 1 }, { "type": "loss", "content": 0.25655749440193176, "timestamp": "2025-09-02 14:12:43.328471", "step": 661, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:43.381527", "step": 661, "epoch": 1 }, { "type": "loss", "content": 0.23715965449810028, "timestamp": "2025-09-02 14:12:43.383953", "step": 662, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:43.436491", "step": 662, "epoch": 1 }, { "type": "loss", "content": 0.19870862364768982, "timestamp": "2025-09-02 14:12:43.438722", "step": 663, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:43.491160", "step": 663, "epoch": 1 }, { "type": "loss", "content": 0.29986172914505005, "timestamp": "2025-09-02 14:12:43.497408", "step": 664, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:43.549631", "step": 664, "epoch": 1 }, { "type": "loss", "content": 0.1733299046754837, "timestamp": "2025-09-02 14:12:43.551730", "step": 665, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:43.604479", "step": 665, "epoch": 1 }, { "type": "loss", "content": 0.20366215705871582, "timestamp": "2025-09-02 14:12:43.615447", "step": 666, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:43.670342", "step": 666, "epoch": 1 }, { "type": "loss", "content": 0.17279738187789917, "timestamp": "2025-09-02 14:12:43.672428", "step": 667, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:43.724490", "step": 667, "epoch": 1 }, { "type": "loss", "content": 0.16177161037921906, "timestamp": "2025-09-02 14:12:43.730548", "step": 668, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:43.783496", "step": 668, "epoch": 1 }, { "type": "loss", "content": 0.2511541545391083, "timestamp": "2025-09-02 14:12:43.785704", "step": 669, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:43.838727", "step": 669, "epoch": 1 }, { "type": "loss", "content": 0.18461602926254272, "timestamp": "2025-09-02 14:12:43.841098", "step": 670, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:43.893731", "step": 670, "epoch": 1 }, { "type": "loss", "content": 0.2800407409667969, "timestamp": "2025-09-02 14:12:43.895914", "step": 671, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:43.948134", "step": 671, "epoch": 1 }, { "type": "loss", "content": 0.21994325518608093, "timestamp": "2025-09-02 14:12:43.953992", "step": 672, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:44.005712", "step": 672, "epoch": 1 }, { "type": "loss", "content": 0.22242200374603271, "timestamp": "2025-09-02 14:12:44.007963", "step": 673, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:44.061183", "step": 673, "epoch": 1 }, { "type": "loss", "content": 0.249558687210083, "timestamp": "2025-09-02 14:12:44.063416", "step": 674, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:44.116286", "step": 674, "epoch": 1 }, { "type": "loss", "content": 0.21726886928081512, "timestamp": "2025-09-02 14:12:44.118585", "step": 675, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:44.171602", "step": 675, "epoch": 1 }, { "type": "loss", "content": 0.12145905941724777, "timestamp": "2025-09-02 14:12:44.177845", "step": 676, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:44.230361", "step": 676, "epoch": 1 }, { "type": "loss", "content": 0.2892654240131378, "timestamp": "2025-09-02 14:12:44.232597", "step": 677, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:44.286527", "step": 677, "epoch": 1 }, { "type": "loss", "content": 0.23332726955413818, "timestamp": "2025-09-02 14:12:44.288789", "step": 678, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:44.341301", "step": 678, "epoch": 1 }, { "type": "loss", "content": 0.13308443129062653, "timestamp": "2025-09-02 14:12:44.343407", "step": 679, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:44.396630", "step": 679, "epoch": 1 }, { "type": "loss", "content": 0.17103779315948486, "timestamp": "2025-09-02 14:12:44.402553", "step": 680, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:12:46.259047", "step": 680, "epoch": 1 }, { "type": "pplx", "content": 7906.30986041824, "timestamp": "2025-09-02 14:12:46.261115", "step": 680, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 680", "timestamp": "2025-09-02 14:12:46.627014", "step": 680, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:46.681991", "step": 680, "epoch": 1 }, { "type": "loss", "content": 0.15590651333332062, "timestamp": "2025-09-02 14:12:46.684151", "step": 681, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:46.738569", "step": 681, "epoch": 1 }, { "type": "loss", "content": 0.21466127038002014, "timestamp": "2025-09-02 14:12:46.740700", "step": 682, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:46.794062", "step": 682, "epoch": 1 }, { "type": "loss", "content": 0.47174811363220215, "timestamp": "2025-09-02 14:12:46.796257", "step": 683, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:46.849133", "step": 683, "epoch": 1 }, { "type": "loss", "content": 0.06468340009450912, "timestamp": "2025-09-02 14:12:46.855779", "step": 684, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:46.908798", "step": 684, "epoch": 1 }, { "type": "loss", "content": 0.14140036702156067, "timestamp": "2025-09-02 14:12:46.910968", "step": 685, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:46.964099", "step": 685, "epoch": 1 }, { "type": "loss", "content": 0.23245996236801147, "timestamp": "2025-09-02 14:12:46.966443", "step": 686, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:47.020575", "step": 686, "epoch": 1 }, { "type": "loss", "content": 0.23960299789905548, "timestamp": "2025-09-02 14:12:47.022726", "step": 687, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:47.075762", "step": 687, "epoch": 1 }, { "type": "loss", "content": 0.33759990334510803, "timestamp": "2025-09-02 14:12:47.081813", "step": 688, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:47.133731", "step": 688, "epoch": 1 }, { "type": "loss", "content": 0.19073475897312164, "timestamp": "2025-09-02 14:12:47.135981", "step": 689, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:47.188895", "step": 689, "epoch": 1 }, { "type": "loss", "content": 0.18564462661743164, "timestamp": "2025-09-02 14:12:47.191293", "step": 690, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:47.244326", "step": 690, "epoch": 1 }, { "type": "loss", "content": 0.20928145945072174, "timestamp": "2025-09-02 14:12:47.246816", "step": 691, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:47.299677", "step": 691, "epoch": 1 }, { "type": "loss", "content": 0.12734439969062805, "timestamp": "2025-09-02 14:12:47.305744", "step": 692, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:47.357853", "step": 692, "epoch": 1 }, { "type": "loss", "content": 0.30287760496139526, "timestamp": "2025-09-02 14:12:47.360699", "step": 693, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:47.413234", "step": 693, "epoch": 1 }, { "type": "loss", "content": 0.28186675906181335, "timestamp": "2025-09-02 14:12:47.415694", "step": 694, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:47.468330", "step": 694, "epoch": 1 }, { "type": "loss", "content": 0.3678439259529114, "timestamp": "2025-09-02 14:12:47.470567", "step": 695, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:47.525090", "step": 695, "epoch": 1 }, { "type": "loss", "content": 0.2649516761302948, "timestamp": "2025-09-02 14:12:47.530948", "step": 696, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:47.584369", "step": 696, "epoch": 1 }, { "type": "loss", "content": 0.19854772090911865, "timestamp": "2025-09-02 14:12:47.586617", "step": 697, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:47.639819", "step": 697, "epoch": 1 }, { "type": "loss", "content": 0.24503359198570251, "timestamp": "2025-09-02 14:12:47.642026", "step": 698, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:47.694191", "step": 698, "epoch": 1 }, { "type": "loss", "content": 0.1808222234249115, "timestamp": "2025-09-02 14:12:47.696586", "step": 699, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:47.749279", "step": 699, "epoch": 1 }, { "type": "loss", "content": 0.21590936183929443, "timestamp": "2025-09-02 14:12:47.754913", "step": 700, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:12:49.618362", "step": 700, "epoch": 1 }, { "type": "pplx", "content": 7912.522169075078, "timestamp": "2025-09-02 14:12:49.620426", "step": 700, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:49.674051", "step": 700, "epoch": 1 }, { "type": "loss", "content": 0.22633829712867737, "timestamp": "2025-09-02 14:12:49.676302", "step": 701, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:49.729721", "step": 701, "epoch": 1 }, { "type": "loss", "content": 0.08249829709529877, "timestamp": "2025-09-02 14:12:49.731875", "step": 702, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:49.784743", "step": 702, "epoch": 1 }, { "type": "loss", "content": 0.22881589829921722, "timestamp": "2025-09-02 14:12:49.786973", "step": 703, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:49.840605", "step": 703, "epoch": 1 }, { "type": "loss", "content": 0.16849926114082336, "timestamp": "2025-09-02 14:12:49.846736", "step": 704, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:49.898777", "step": 704, "epoch": 1 }, { "type": "loss", "content": 0.14494319260120392, "timestamp": "2025-09-02 14:12:49.900904", "step": 705, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:49.953432", "step": 705, "epoch": 1 }, { "type": "loss", "content": 0.1289961189031601, "timestamp": "2025-09-02 14:12:49.955873", "step": 706, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:50.008006", "step": 706, "epoch": 1 }, { "type": "loss", "content": 0.2857365310192108, "timestamp": "2025-09-02 14:12:50.011271", "step": 707, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:50.064214", "step": 707, "epoch": 1 }, { "type": "loss", "content": 0.2328745424747467, "timestamp": "2025-09-02 14:12:50.070337", "step": 708, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:50.122498", "step": 708, "epoch": 1 }, { "type": "loss", "content": 0.14068998396396637, "timestamp": "2025-09-02 14:12:50.124717", "step": 709, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:50.177573", "step": 709, "epoch": 1 }, { "type": "loss", "content": 0.22607575356960297, "timestamp": "2025-09-02 14:12:50.179694", "step": 710, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:50.232323", "step": 710, "epoch": 1 }, { "type": "loss", "content": 0.11389411240816116, "timestamp": "2025-09-02 14:12:50.234457", "step": 711, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:50.287237", "step": 711, "epoch": 1 }, { "type": "loss", "content": 0.13633550703525543, "timestamp": "2025-09-02 14:12:50.293140", "step": 712, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:50.345369", "step": 712, "epoch": 1 }, { "type": "loss", "content": 0.1671725958585739, "timestamp": "2025-09-02 14:12:50.347464", "step": 713, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:50.400014", "step": 713, "epoch": 1 }, { "type": "loss", "content": 0.11842828243970871, "timestamp": "2025-09-02 14:12:50.402534", "step": 714, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:50.455038", "step": 714, "epoch": 1 }, { "type": "loss", "content": 0.3006352186203003, "timestamp": "2025-09-02 14:12:50.456951", "step": 715, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:50.509095", "step": 715, "epoch": 1 }, { "type": "loss", "content": 0.20352716743946075, "timestamp": "2025-09-02 14:12:50.514590", "step": 716, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:50.567876", "step": 716, "epoch": 1 }, { "type": "loss", "content": 0.17974495887756348, "timestamp": "2025-09-02 14:12:50.570143", "step": 717, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:50.623158", "step": 717, "epoch": 1 }, { "type": "loss", "content": 0.13812130689620972, "timestamp": "2025-09-02 14:12:50.625261", "step": 718, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:50.678373", "step": 718, "epoch": 1 }, { "type": "loss", "content": 0.24547283351421356, "timestamp": "2025-09-02 14:12:50.680602", "step": 719, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:50.733227", "step": 719, "epoch": 1 }, { "type": "loss", "content": 0.12437926977872849, "timestamp": "2025-09-02 14:12:50.739012", "step": 720, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:12:52.589625", "step": 720, "epoch": 1 }, { "type": "pplx", "content": 7935.933168776482, "timestamp": "2025-09-02 14:12:52.591823", "step": 720, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 720", "timestamp": "2025-09-02 14:12:52.986654", "step": 720, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:53.042621", "step": 720, "epoch": 1 }, { "type": "loss", "content": 0.19048087298870087, "timestamp": "2025-09-02 14:12:53.044921", "step": 721, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:53.099543", "step": 721, "epoch": 1 }, { "type": "loss", "content": 0.11665996164083481, "timestamp": "2025-09-02 14:12:53.101861", "step": 722, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:53.154032", "step": 722, "epoch": 1 }, { "type": "loss", "content": 0.21225671470165253, "timestamp": "2025-09-02 14:12:53.155881", "step": 723, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:53.209046", "step": 723, "epoch": 1 }, { "type": "loss", "content": 0.18413183093070984, "timestamp": "2025-09-02 14:12:53.215275", "step": 724, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:53.267723", "step": 724, "epoch": 1 }, { "type": "loss", "content": 0.06077410653233528, "timestamp": "2025-09-02 14:12:53.269852", "step": 725, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:53.323958", "step": 725, "epoch": 1 }, { "type": "loss", "content": 0.23248250782489777, "timestamp": "2025-09-02 14:12:53.326067", "step": 726, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:53.380429", "step": 726, "epoch": 1 }, { "type": "loss", "content": 0.13881835341453552, "timestamp": "2025-09-02 14:12:53.382381", "step": 727, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:53.436617", "step": 727, "epoch": 1 }, { "type": "loss", "content": 0.12780538201332092, "timestamp": "2025-09-02 14:12:53.442537", "step": 728, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:53.496725", "step": 728, "epoch": 1 }, { "type": "loss", "content": 0.14719468355178833, "timestamp": "2025-09-02 14:12:53.498882", "step": 729, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:53.551390", "step": 729, "epoch": 1 }, { "type": "loss", "content": 0.1849883794784546, "timestamp": "2025-09-02 14:12:53.553330", "step": 730, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:53.607852", "step": 730, "epoch": 1 }, { "type": "loss", "content": 0.15184366703033447, "timestamp": "2025-09-02 14:12:53.610003", "step": 731, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:53.662326", "step": 731, "epoch": 1 }, { "type": "loss", "content": 0.23728883266448975, "timestamp": "2025-09-02 14:12:53.668039", "step": 732, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:53.719805", "step": 732, "epoch": 1 }, { "type": "loss", "content": 0.2514018416404724, "timestamp": "2025-09-02 14:12:53.722178", "step": 733, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:53.775135", "step": 733, "epoch": 1 }, { "type": "loss", "content": 0.12702736258506775, "timestamp": "2025-09-02 14:12:53.777242", "step": 734, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:53.830446", "step": 734, "epoch": 1 }, { "type": "loss", "content": 0.16683834791183472, "timestamp": "2025-09-02 14:12:53.832297", "step": 735, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:53.886591", "step": 735, "epoch": 1 }, { "type": "loss", "content": 0.1954077184200287, "timestamp": "2025-09-02 14:12:53.892235", "step": 736, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:53.943583", "step": 736, "epoch": 1 }, { "type": "loss", "content": 0.20041660964488983, "timestamp": "2025-09-02 14:12:53.945853", "step": 737, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:53.998938", "step": 737, "epoch": 1 }, { "type": "loss", "content": 0.21242669224739075, "timestamp": "2025-09-02 14:12:54.001244", "step": 738, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:54.055232", "step": 738, "epoch": 1 }, { "type": "loss", "content": 0.2484673112630844, "timestamp": "2025-09-02 14:12:54.057630", "step": 739, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:54.111947", "step": 739, "epoch": 1 }, { "type": "loss", "content": 0.19325634837150574, "timestamp": "2025-09-02 14:12:54.117813", "step": 740, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:12:55.965297", "step": 740, "epoch": 1 }, { "type": "pplx", "content": 7920.876259801516, "timestamp": "2025-09-02 14:12:55.967362", "step": 740, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:56.020263", "step": 740, "epoch": 1 }, { "type": "loss", "content": 0.12128989398479462, "timestamp": "2025-09-02 14:12:56.022596", "step": 741, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:56.076166", "step": 741, "epoch": 1 }, { "type": "loss", "content": 0.1290517896413803, "timestamp": "2025-09-02 14:12:56.078218", "step": 742, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:56.130873", "step": 742, "epoch": 1 }, { "type": "loss", "content": 0.2079210728406906, "timestamp": "2025-09-02 14:12:56.132710", "step": 743, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:56.184657", "step": 743, "epoch": 1 }, { "type": "loss", "content": 0.15682601928710938, "timestamp": "2025-09-02 14:12:56.190209", "step": 744, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:56.242219", "step": 744, "epoch": 1 }, { "type": "loss", "content": 0.1883786916732788, "timestamp": "2025-09-02 14:12:56.244773", "step": 745, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:56.298589", "step": 745, "epoch": 1 }, { "type": "loss", "content": 0.1591564565896988, "timestamp": "2025-09-02 14:12:56.300960", "step": 746, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:56.355754", "step": 746, "epoch": 1 }, { "type": "loss", "content": 0.19211481511592865, "timestamp": "2025-09-02 14:12:56.357915", "step": 747, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:56.417758", "step": 747, "epoch": 1 }, { "type": "loss", "content": 0.34546956419944763, "timestamp": "2025-09-02 14:12:56.424475", "step": 748, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:56.478969", "step": 748, "epoch": 1 }, { "type": "loss", "content": 0.14235959947109222, "timestamp": "2025-09-02 14:12:56.481186", "step": 749, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:56.535935", "step": 749, "epoch": 1 }, { "type": "loss", "content": 0.12163538485765457, "timestamp": "2025-09-02 14:12:56.538293", "step": 750, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:56.592762", "step": 750, "epoch": 1 }, { "type": "loss", "content": 0.21689672768115997, "timestamp": "2025-09-02 14:12:56.596046", "step": 751, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:56.649960", "step": 751, "epoch": 1 }, { "type": "loss", "content": 0.13091091811656952, "timestamp": "2025-09-02 14:12:56.655776", "step": 752, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:56.709388", "step": 752, "epoch": 1 }, { "type": "loss", "content": 0.3219466507434845, "timestamp": "2025-09-02 14:12:56.711640", "step": 753, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:56.765933", "step": 753, "epoch": 1 }, { "type": "loss", "content": 0.14897404611110687, "timestamp": "2025-09-02 14:12:56.767859", "step": 754, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:56.823396", "step": 754, "epoch": 1 }, { "type": "loss", "content": 0.14813624322414398, "timestamp": "2025-09-02 14:12:56.825663", "step": 755, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:56.881917", "step": 755, "epoch": 1 }, { "type": "loss", "content": 0.17322079837322235, "timestamp": "2025-09-02 14:12:56.888803", "step": 756, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:56.951398", "step": 756, "epoch": 1 }, { "type": "loss", "content": 0.2692295014858246, "timestamp": "2025-09-02 14:12:56.954067", "step": 757, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:57.009792", "step": 757, "epoch": 1 }, { "type": "loss", "content": 0.31910592317581177, "timestamp": "2025-09-02 14:12:57.012075", "step": 758, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:57.068649", "step": 758, "epoch": 1 }, { "type": "loss", "content": 0.1266038566827774, "timestamp": "2025-09-02 14:12:57.071188", "step": 759, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:57.129689", "step": 759, "epoch": 1 }, { "type": "loss", "content": 0.14064660668373108, "timestamp": "2025-09-02 14:12:57.137970", "step": 760, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:12:58.992216", "step": 760, "epoch": 1 }, { "type": "pplx", "content": 7816.3706500695325, "timestamp": "2025-09-02 14:12:58.994379", "step": 760, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 760", "timestamp": "2025-09-02 14:12:59.474370", "step": 760, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:59.530167", "step": 760, "epoch": 1 }, { "type": "loss", "content": 0.1624010056257248, "timestamp": "2025-09-02 14:12:59.532698", "step": 761, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:59.586028", "step": 761, "epoch": 1 }, { "type": "loss", "content": 0.24109973013401031, "timestamp": "2025-09-02 14:12:59.588739", "step": 762, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:59.641186", "step": 762, "epoch": 1 }, { "type": "loss", "content": 0.30950927734375, "timestamp": "2025-09-02 14:12:59.643371", "step": 763, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:12:59.696608", "step": 763, "epoch": 1 }, { "type": "loss", "content": 0.1335359662771225, "timestamp": "2025-09-02 14:12:59.702738", "step": 764, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:12:59.755043", "step": 764, "epoch": 1 }, { "type": "loss", "content": 0.16165265440940857, "timestamp": "2025-09-02 14:12:59.757058", "step": 765, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:59.809143", "step": 765, "epoch": 1 }, { "type": "loss", "content": 0.12370990961790085, "timestamp": "2025-09-02 14:12:59.811405", "step": 766, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:59.864381", "step": 766, "epoch": 1 }, { "type": "loss", "content": 0.1836068034172058, "timestamp": "2025-09-02 14:12:59.866640", "step": 767, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:59.918939", "step": 767, "epoch": 1 }, { "type": "loss", "content": 0.30099523067474365, "timestamp": "2025-09-02 14:12:59.924911", "step": 768, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:12:59.977305", "step": 768, "epoch": 1 }, { "type": "loss", "content": 0.24648365378379822, "timestamp": "2025-09-02 14:12:59.979380", "step": 769, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:00.032316", "step": 769, "epoch": 1 }, { "type": "loss", "content": 0.2697862684726715, "timestamp": "2025-09-02 14:13:00.035006", "step": 770, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:00.092616", "step": 770, "epoch": 1 }, { "type": "loss", "content": 0.14116349816322327, "timestamp": "2025-09-02 14:13:00.094846", "step": 771, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:00.154602", "step": 771, "epoch": 1 }, { "type": "loss", "content": 0.1452658772468567, "timestamp": "2025-09-02 14:13:00.160651", "step": 772, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:00.212740", "step": 772, "epoch": 1 }, { "type": "loss", "content": 0.16761545836925507, "timestamp": "2025-09-02 14:13:00.215227", "step": 773, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:00.269046", "step": 773, "epoch": 1 }, { "type": "loss", "content": 0.21478907763957977, "timestamp": "2025-09-02 14:13:00.271245", "step": 774, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:00.328064", "step": 774, "epoch": 1 }, { "type": "loss", "content": 0.2519703805446625, "timestamp": "2025-09-02 14:13:00.330263", "step": 775, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:00.396229", "step": 775, "epoch": 1 }, { "type": "loss", "content": 0.1852380931377411, "timestamp": "2025-09-02 14:13:00.401715", "step": 776, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:00.461734", "step": 776, "epoch": 1 }, { "type": "loss", "content": 0.1929105967283249, "timestamp": "2025-09-02 14:13:00.463763", "step": 777, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:00.521487", "step": 777, "epoch": 1 }, { "type": "loss", "content": 0.25451821088790894, "timestamp": "2025-09-02 14:13:00.523253", "step": 778, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:00.575257", "step": 778, "epoch": 1 }, { "type": "loss", "content": 0.21789592504501343, "timestamp": "2025-09-02 14:13:00.577362", "step": 779, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:00.636875", "step": 779, "epoch": 1 }, { "type": "loss", "content": 0.23432296514511108, "timestamp": "2025-09-02 14:13:00.649606", "step": 780, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:13:02.529336", "step": 780, "epoch": 1 }, { "type": "pplx", "content": 7739.340512228412, "timestamp": "2025-09-02 14:13:02.531689", "step": 780, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:02.584771", "step": 780, "epoch": 1 }, { "type": "loss", "content": 0.21955612301826477, "timestamp": "2025-09-02 14:13:02.586745", "step": 781, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:02.640790", "step": 781, "epoch": 1 }, { "type": "loss", "content": 0.24403071403503418, "timestamp": "2025-09-02 14:13:02.642761", "step": 782, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:02.695436", "step": 782, "epoch": 1 }, { "type": "loss", "content": 0.3050089180469513, "timestamp": "2025-09-02 14:13:02.697867", "step": 783, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:02.750365", "step": 783, "epoch": 1 }, { "type": "loss", "content": 0.10141702741384506, "timestamp": "2025-09-02 14:13:02.756222", "step": 784, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:02.810333", "step": 784, "epoch": 1 }, { "type": "loss", "content": 0.09701476991176605, "timestamp": "2025-09-02 14:13:02.812318", "step": 785, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:02.866340", "step": 785, "epoch": 1 }, { "type": "loss", "content": 0.12233109772205353, "timestamp": "2025-09-02 14:13:02.868477", "step": 786, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:02.923815", "step": 786, "epoch": 1 }, { "type": "loss", "content": 0.17463451623916626, "timestamp": "2025-09-02 14:13:02.925909", "step": 787, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:02.979202", "step": 787, "epoch": 1 }, { "type": "loss", "content": 0.1267501264810562, "timestamp": "2025-09-02 14:13:02.985657", "step": 788, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:03.038197", "step": 788, "epoch": 1 }, { "type": "loss", "content": 0.14238981902599335, "timestamp": "2025-09-02 14:13:03.040464", "step": 789, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:03.092939", "step": 789, "epoch": 1 }, { "type": "loss", "content": 0.23867711424827576, "timestamp": "2025-09-02 14:13:03.095372", "step": 790, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:03.148011", "step": 790, "epoch": 1 }, { "type": "loss", "content": 0.12874369323253632, "timestamp": "2025-09-02 14:13:03.150201", "step": 791, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:03.203857", "step": 791, "epoch": 1 }, { "type": "loss", "content": 0.18163302540779114, "timestamp": "2025-09-02 14:13:03.209861", "step": 792, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:03.262631", "step": 792, "epoch": 1 }, { "type": "loss", "content": 0.08785891532897949, "timestamp": "2025-09-02 14:13:03.264862", "step": 793, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:03.317406", "step": 793, "epoch": 1 }, { "type": "loss", "content": 0.3065173625946045, "timestamp": "2025-09-02 14:13:03.319135", "step": 794, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:03.371860", "step": 794, "epoch": 1 }, { "type": "loss", "content": 0.245545893907547, "timestamp": "2025-09-02 14:13:03.373904", "step": 795, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:03.428146", "step": 795, "epoch": 1 }, { "type": "loss", "content": 0.29581278562545776, "timestamp": "2025-09-02 14:13:03.434269", "step": 796, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:03.487142", "step": 796, "epoch": 1 }, { "type": "loss", "content": 0.17703501880168915, "timestamp": "2025-09-02 14:13:03.489351", "step": 797, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:03.542381", "step": 797, "epoch": 1 }, { "type": "loss", "content": 0.14578594267368317, "timestamp": "2025-09-02 14:13:03.544159", "step": 798, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:03.596881", "step": 798, "epoch": 1 }, { "type": "loss", "content": 0.12644097208976746, "timestamp": "2025-09-02 14:13:03.599165", "step": 799, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:03.652249", "step": 799, "epoch": 1 }, { "type": "loss", "content": 0.23014090955257416, "timestamp": "2025-09-02 14:13:03.658187", "step": 800, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:13:05.523603", "step": 800, "epoch": 1 }, { "type": "pplx", "content": 7770.708746561899, "timestamp": "2025-09-02 14:13:05.525652", "step": 800, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 800", "timestamp": "2025-09-02 14:13:05.901102", "step": 800, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:05.958465", "step": 800, "epoch": 1 }, { "type": "loss", "content": 0.13451027870178223, "timestamp": "2025-09-02 14:13:05.960751", "step": 801, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:06.016194", "step": 801, "epoch": 1 }, { "type": "loss", "content": 0.16393296420574188, "timestamp": "2025-09-02 14:13:06.018429", "step": 802, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:06.072783", "step": 802, "epoch": 1 }, { "type": "loss", "content": 0.3267008066177368, "timestamp": "2025-09-02 14:13:06.075019", "step": 803, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:06.128467", "step": 803, "epoch": 1 }, { "type": "loss", "content": 0.13317596912384033, "timestamp": "2025-09-02 14:13:06.134661", "step": 804, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:06.187549", "step": 804, "epoch": 1 }, { "type": "loss", "content": 0.2523692846298218, "timestamp": "2025-09-02 14:13:06.190126", "step": 805, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:06.244042", "step": 805, "epoch": 1 }, { "type": "loss", "content": 0.17904481291770935, "timestamp": "2025-09-02 14:13:06.246291", "step": 806, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:06.299652", "step": 806, "epoch": 1 }, { "type": "loss", "content": 0.2579326629638672, "timestamp": "2025-09-02 14:13:06.301906", "step": 807, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:06.354707", "step": 807, "epoch": 1 }, { "type": "loss", "content": 0.1636921614408493, "timestamp": "2025-09-02 14:13:06.360772", "step": 808, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:06.412804", "step": 808, "epoch": 1 }, { "type": "loss", "content": 0.17405761778354645, "timestamp": "2025-09-02 14:13:06.414967", "step": 809, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:06.470313", "step": 809, "epoch": 1 }, { "type": "loss", "content": 0.2610548436641693, "timestamp": "2025-09-02 14:13:06.472605", "step": 810, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:06.525425", "step": 810, "epoch": 1 }, { "type": "loss", "content": 0.08704736083745956, "timestamp": "2025-09-02 14:13:06.528102", "step": 811, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:06.581929", "step": 811, "epoch": 1 }, { "type": "loss", "content": 0.25421997904777527, "timestamp": "2025-09-02 14:13:06.588633", "step": 812, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:06.641920", "step": 812, "epoch": 1 }, { "type": "loss", "content": 0.23566123843193054, "timestamp": "2025-09-02 14:13:06.644016", "step": 813, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:06.696865", "step": 813, "epoch": 1 }, { "type": "loss", "content": 0.13563935458660126, "timestamp": "2025-09-02 14:13:06.699226", "step": 814, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:06.752618", "step": 814, "epoch": 1 }, { "type": "loss", "content": 0.20804756879806519, "timestamp": "2025-09-02 14:13:06.754938", "step": 815, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:06.808935", "step": 815, "epoch": 1 }, { "type": "loss", "content": 0.30100616812705994, "timestamp": "2025-09-02 14:13:06.815004", "step": 816, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:06.868058", "step": 816, "epoch": 1 }, { "type": "loss", "content": 0.2613675892353058, "timestamp": "2025-09-02 14:13:06.870210", "step": 817, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:06.923313", "step": 817, "epoch": 1 }, { "type": "loss", "content": 0.14258906245231628, "timestamp": "2025-09-02 14:13:06.925707", "step": 818, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:06.978266", "step": 818, "epoch": 1 }, { "type": "loss", "content": 0.22413280606269836, "timestamp": "2025-09-02 14:13:06.980382", "step": 819, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:07.033378", "step": 819, "epoch": 1 }, { "type": "loss", "content": 0.26451605558395386, "timestamp": "2025-09-02 14:13:07.039575", "step": 820, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:13:08.889958", "step": 820, "epoch": 1 }, { "type": "pplx", "content": 7807.370600839108, "timestamp": "2025-09-02 14:13:08.892140", "step": 820, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:08.944962", "step": 820, "epoch": 1 }, { "type": "loss", "content": 0.2115056961774826, "timestamp": "2025-09-02 14:13:08.947985", "step": 821, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:09.002279", "step": 821, "epoch": 1 }, { "type": "loss", "content": 0.20181508362293243, "timestamp": "2025-09-02 14:13:09.004473", "step": 822, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:09.057703", "step": 822, "epoch": 1 }, { "type": "loss", "content": 0.292702317237854, "timestamp": "2025-09-02 14:13:09.059960", "step": 823, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:09.112931", "step": 823, "epoch": 1 }, { "type": "loss", "content": 0.13345138728618622, "timestamp": "2025-09-02 14:13:09.119198", "step": 824, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:09.171955", "step": 824, "epoch": 1 }, { "type": "loss", "content": 0.0815330445766449, "timestamp": "2025-09-02 14:13:09.174351", "step": 825, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:09.227202", "step": 825, "epoch": 1 }, { "type": "loss", "content": 0.17190802097320557, "timestamp": "2025-09-02 14:13:09.229495", "step": 826, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:09.282138", "step": 826, "epoch": 1 }, { "type": "loss", "content": 0.15718881785869598, "timestamp": "2025-09-02 14:13:09.284363", "step": 827, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:09.337121", "step": 827, "epoch": 1 }, { "type": "loss", "content": 0.11583644151687622, "timestamp": "2025-09-02 14:13:09.343107", "step": 828, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:09.395707", "step": 828, "epoch": 1 }, { "type": "loss", "content": 0.4308925271034241, "timestamp": "2025-09-02 14:13:09.397973", "step": 829, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:09.450351", "step": 829, "epoch": 1 }, { "type": "loss", "content": 0.1544128805398941, "timestamp": "2025-09-02 14:13:09.452621", "step": 830, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:09.505310", "step": 830, "epoch": 1 }, { "type": "loss", "content": 0.20544810593128204, "timestamp": "2025-09-02 14:13:09.507685", "step": 831, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:09.560105", "step": 831, "epoch": 1 }, { "type": "loss", "content": 0.11472009867429733, "timestamp": "2025-09-02 14:13:09.566089", "step": 832, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:09.618334", "step": 832, "epoch": 1 }, { "type": "loss", "content": 0.19708462059497833, "timestamp": "2025-09-02 14:13:09.620526", "step": 833, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:09.672935", "step": 833, "epoch": 1 }, { "type": "loss", "content": 0.10617980360984802, "timestamp": "2025-09-02 14:13:09.675177", "step": 834, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:09.727748", "step": 834, "epoch": 1 }, { "type": "loss", "content": 0.300912469625473, "timestamp": "2025-09-02 14:13:09.730039", "step": 835, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:09.782838", "step": 835, "epoch": 1 }, { "type": "loss", "content": 0.17563775181770325, "timestamp": "2025-09-02 14:13:09.789077", "step": 836, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:09.841182", "step": 836, "epoch": 1 }, { "type": "loss", "content": 0.2332932949066162, "timestamp": "2025-09-02 14:13:09.843449", "step": 837, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:09.896526", "step": 837, "epoch": 1 }, { "type": "loss", "content": 0.18961255252361298, "timestamp": "2025-09-02 14:13:09.898724", "step": 838, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:09.951137", "step": 838, "epoch": 1 }, { "type": "loss", "content": 0.09531288594007492, "timestamp": "2025-09-02 14:13:09.953323", "step": 839, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:10.007081", "step": 839, "epoch": 1 }, { "type": "loss", "content": 0.2127091884613037, "timestamp": "2025-09-02 14:13:10.013477", "step": 840, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:13:11.866820", "step": 840, "epoch": 1 }, { "type": "pplx", "content": 7874.267780016884, "timestamp": "2025-09-02 14:13:11.869056", "step": 840, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 840", "timestamp": "2025-09-02 14:13:12.228963", "step": 840, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:12.287345", "step": 840, "epoch": 1 }, { "type": "loss", "content": 0.23928850889205933, "timestamp": "2025-09-02 14:13:12.289980", "step": 841, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:12.348419", "step": 841, "epoch": 1 }, { "type": "loss", "content": 0.32385653257369995, "timestamp": "2025-09-02 14:13:12.356866", "step": 842, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:12.413940", "step": 842, "epoch": 1 }, { "type": "loss", "content": 0.07635907828807831, "timestamp": "2025-09-02 14:13:12.418692", "step": 843, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:12.474205", "step": 843, "epoch": 1 }, { "type": "loss", "content": 0.22491808235645294, "timestamp": "2025-09-02 14:13:12.481942", "step": 844, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:12.536575", "step": 844, "epoch": 1 }, { "type": "loss", "content": 0.2157159298658371, "timestamp": "2025-09-02 14:13:12.547890", "step": 845, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:12.610924", "step": 845, "epoch": 1 }, { "type": "loss", "content": 0.19950488209724426, "timestamp": "2025-09-02 14:13:12.615783", "step": 846, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:12.671389", "step": 846, "epoch": 1 }, { "type": "loss", "content": 0.08629981428384781, "timestamp": "2025-09-02 14:13:12.674137", "step": 847, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:12.727330", "step": 847, "epoch": 1 }, { "type": "loss", "content": 0.18819572031497955, "timestamp": "2025-09-02 14:13:12.733590", "step": 848, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:12.786330", "step": 848, "epoch": 1 }, { "type": "loss", "content": 0.2759976089000702, "timestamp": "2025-09-02 14:13:12.795030", "step": 849, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:12.853954", "step": 849, "epoch": 1 }, { "type": "loss", "content": 0.16720882058143616, "timestamp": "2025-09-02 14:13:12.863262", "step": 850, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:12.920312", "step": 850, "epoch": 1 }, { "type": "loss", "content": 0.17698390781879425, "timestamp": "2025-09-02 14:13:12.929306", "step": 851, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:12.988652", "step": 851, "epoch": 1 }, { "type": "loss", "content": 0.20411069691181183, "timestamp": "2025-09-02 14:13:12.995223", "step": 852, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:13.050314", "step": 852, "epoch": 1 }, { "type": "loss", "content": 0.23517148196697235, "timestamp": "2025-09-02 14:13:13.053774", "step": 853, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:13.109781", "step": 853, "epoch": 1 }, { "type": "loss", "content": 0.2100260704755783, "timestamp": "2025-09-02 14:13:13.112850", "step": 854, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:13.167957", "step": 854, "epoch": 1 }, { "type": "loss", "content": 0.18422658741474152, "timestamp": "2025-09-02 14:13:13.172067", "step": 855, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:13.229583", "step": 855, "epoch": 1 }, { "type": "loss", "content": 0.3235742449760437, "timestamp": "2025-09-02 14:13:13.237858", "step": 856, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:13.290908", "step": 856, "epoch": 1 }, { "type": "loss", "content": 0.2023991048336029, "timestamp": "2025-09-02 14:13:13.299526", "step": 857, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:13.360336", "step": 857, "epoch": 1 }, { "type": "loss", "content": 0.1359989494085312, "timestamp": "2025-09-02 14:13:13.375939", "step": 858, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:13.443463", "step": 858, "epoch": 1 }, { "type": "loss", "content": 0.13767452538013458, "timestamp": "2025-09-02 14:13:13.460408", "step": 859, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:13.515244", "step": 859, "epoch": 1 }, { "type": "loss", "content": 0.17548757791519165, "timestamp": "2025-09-02 14:13:13.522169", "step": 860, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:13:15.411143", "step": 860, "epoch": 1 }, { "type": "pplx", "content": 7917.218243583269, "timestamp": "2025-09-02 14:13:15.413126", "step": 860, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:15.465927", "step": 860, "epoch": 1 }, { "type": "loss", "content": 0.26067256927490234, "timestamp": "2025-09-02 14:13:15.468045", "step": 861, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:15.521188", "step": 861, "epoch": 1 }, { "type": "loss", "content": 0.24135203659534454, "timestamp": "2025-09-02 14:13:15.523448", "step": 862, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:15.577207", "step": 862, "epoch": 1 }, { "type": "loss", "content": 0.20809251070022583, "timestamp": "2025-09-02 14:13:15.579539", "step": 863, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:15.633321", "step": 863, "epoch": 1 }, { "type": "loss", "content": 0.157253697514534, "timestamp": "2025-09-02 14:13:15.639606", "step": 864, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:15.693090", "step": 864, "epoch": 1 }, { "type": "loss", "content": 0.17173855006694794, "timestamp": "2025-09-02 14:13:15.695337", "step": 865, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:15.748040", "step": 865, "epoch": 1 }, { "type": "loss", "content": 0.20683398842811584, "timestamp": "2025-09-02 14:13:15.750319", "step": 866, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:15.803098", "step": 866, "epoch": 1 }, { "type": "loss", "content": 0.10126219689846039, "timestamp": "2025-09-02 14:13:15.805393", "step": 867, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:15.858532", "step": 867, "epoch": 1 }, { "type": "loss", "content": 0.17409837245941162, "timestamp": "2025-09-02 14:13:15.864530", "step": 868, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:15.925547", "step": 868, "epoch": 1 }, { "type": "loss", "content": 0.07787155359983444, "timestamp": "2025-09-02 14:13:15.928532", "step": 869, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:15.982289", "step": 869, "epoch": 1 }, { "type": "loss", "content": 0.18379290401935577, "timestamp": "2025-09-02 14:13:15.985173", "step": 870, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:16.039812", "step": 870, "epoch": 1 }, { "type": "loss", "content": 0.2374415099620819, "timestamp": "2025-09-02 14:13:16.042145", "step": 871, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:16.095407", "step": 871, "epoch": 1 }, { "type": "loss", "content": 0.2553151845932007, "timestamp": "2025-09-02 14:13:16.101507", "step": 872, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:16.155122", "step": 872, "epoch": 1 }, { "type": "loss", "content": 0.1868804544210434, "timestamp": "2025-09-02 14:13:16.156811", "step": 873, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:16.209407", "step": 873, "epoch": 1 }, { "type": "loss", "content": 0.1404283195734024, "timestamp": "2025-09-02 14:13:16.211848", "step": 874, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:16.265048", "step": 874, "epoch": 1 }, { "type": "loss", "content": 0.28834718465805054, "timestamp": "2025-09-02 14:13:16.267789", "step": 875, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:16.321187", "step": 875, "epoch": 1 }, { "type": "loss", "content": 0.18335530161857605, "timestamp": "2025-09-02 14:13:16.327434", "step": 876, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:16.379594", "step": 876, "epoch": 1 }, { "type": "loss", "content": 0.267606258392334, "timestamp": "2025-09-02 14:13:16.381897", "step": 877, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:16.435169", "step": 877, "epoch": 1 }, { "type": "loss", "content": 0.24636401236057281, "timestamp": "2025-09-02 14:13:16.437091", "step": 878, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:16.489684", "step": 878, "epoch": 1 }, { "type": "loss", "content": 0.15712353587150574, "timestamp": "2025-09-02 14:13:16.491626", "step": 879, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:16.543966", "step": 879, "epoch": 1 }, { "type": "loss", "content": 0.13192637264728546, "timestamp": "2025-09-02 14:13:16.550110", "step": 880, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:13:18.427640", "step": 880, "epoch": 1 }, { "type": "pplx", "content": 7965.472929269665, "timestamp": "2025-09-02 14:13:18.430114", "step": 880, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 880", "timestamp": "2025-09-02 14:13:18.795798", "step": 880, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:18.852883", "step": 880, "epoch": 1 }, { "type": "loss", "content": 0.1987415999174118, "timestamp": "2025-09-02 14:13:18.855129", "step": 881, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:18.909619", "step": 881, "epoch": 1 }, { "type": "loss", "content": 0.3409096598625183, "timestamp": "2025-09-02 14:13:18.912538", "step": 882, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:18.966482", "step": 882, "epoch": 1 }, { "type": "loss", "content": 0.20831014215946198, "timestamp": "2025-09-02 14:13:18.968720", "step": 883, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:19.021759", "step": 883, "epoch": 1 }, { "type": "loss", "content": 0.3011943995952606, "timestamp": "2025-09-02 14:13:19.028225", "step": 884, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:19.081004", "step": 884, "epoch": 1 }, { "type": "loss", "content": 0.33738425374031067, "timestamp": "2025-09-02 14:13:19.084598", "step": 885, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:19.137823", "step": 885, "epoch": 1 }, { "type": "loss", "content": 0.2369687706232071, "timestamp": "2025-09-02 14:13:19.139939", "step": 886, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:19.193320", "step": 886, "epoch": 1 }, { "type": "loss", "content": 0.19282115995883942, "timestamp": "2025-09-02 14:13:19.195642", "step": 887, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:19.248744", "step": 887, "epoch": 1 }, { "type": "loss", "content": 0.15914583206176758, "timestamp": "2025-09-02 14:13:19.254542", "step": 888, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:19.306808", "step": 888, "epoch": 1 }, { "type": "loss", "content": 0.12965087592601776, "timestamp": "2025-09-02 14:13:19.313893", "step": 889, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:19.369611", "step": 889, "epoch": 1 }, { "type": "loss", "content": 0.10052490234375, "timestamp": "2025-09-02 14:13:19.372008", "step": 890, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:19.424937", "step": 890, "epoch": 1 }, { "type": "loss", "content": 0.20652812719345093, "timestamp": "2025-09-02 14:13:19.427174", "step": 891, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:19.479371", "step": 891, "epoch": 1 }, { "type": "loss", "content": 0.10844855010509491, "timestamp": "2025-09-02 14:13:19.485479", "step": 892, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:19.537804", "step": 892, "epoch": 1 }, { "type": "loss", "content": 0.17090748250484467, "timestamp": "2025-09-02 14:13:19.540197", "step": 893, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:19.593809", "step": 893, "epoch": 1 }, { "type": "loss", "content": 0.1207038089632988, "timestamp": "2025-09-02 14:13:19.596246", "step": 894, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:19.649273", "step": 894, "epoch": 1 }, { "type": "loss", "content": 0.1838492602109909, "timestamp": "2025-09-02 14:13:19.651641", "step": 895, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:19.704634", "step": 895, "epoch": 1 }, { "type": "loss", "content": 0.29692086577415466, "timestamp": "2025-09-02 14:13:19.710795", "step": 896, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:19.763213", "step": 896, "epoch": 1 }, { "type": "loss", "content": 0.19614513218402863, "timestamp": "2025-09-02 14:13:19.765398", "step": 897, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:19.819060", "step": 897, "epoch": 1 }, { "type": "loss", "content": 0.245331808924675, "timestamp": "2025-09-02 14:13:19.820844", "step": 898, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:19.874135", "step": 898, "epoch": 1 }, { "type": "loss", "content": 0.05622559413313866, "timestamp": "2025-09-02 14:13:19.879063", "step": 899, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:19.935285", "step": 899, "epoch": 1 }, { "type": "loss", "content": 0.27839696407318115, "timestamp": "2025-09-02 14:13:19.941567", "step": 900, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:13:21.866103", "step": 900, "epoch": 1 }, { "type": "pplx", "content": 7962.8395584659, "timestamp": "2025-09-02 14:13:21.868230", "step": 900, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:21.920524", "step": 900, "epoch": 1 }, { "type": "loss", "content": 0.08369795233011246, "timestamp": "2025-09-02 14:13:21.922995", "step": 901, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:21.977164", "step": 901, "epoch": 1 }, { "type": "loss", "content": 0.18949854373931885, "timestamp": "2025-09-02 14:13:21.979628", "step": 902, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:22.032803", "step": 902, "epoch": 1 }, { "type": "loss", "content": 0.17946869134902954, "timestamp": "2025-09-02 14:13:22.035033", "step": 903, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:22.087743", "step": 903, "epoch": 1 }, { "type": "loss", "content": 0.17319419980049133, "timestamp": "2025-09-02 14:13:22.093811", "step": 904, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:22.145662", "step": 904, "epoch": 1 }, { "type": "loss", "content": 0.3057437837123871, "timestamp": "2025-09-02 14:13:22.147877", "step": 905, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:22.201085", "step": 905, "epoch": 1 }, { "type": "loss", "content": 0.2209518700838089, "timestamp": "2025-09-02 14:13:22.203159", "step": 906, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:22.256529", "step": 906, "epoch": 1 }, { "type": "loss", "content": 0.23834586143493652, "timestamp": "2025-09-02 14:13:22.259075", "step": 907, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:22.312141", "step": 907, "epoch": 1 }, { "type": "loss", "content": 0.1948746293783188, "timestamp": "2025-09-02 14:13:22.318102", "step": 908, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:22.371273", "step": 908, "epoch": 1 }, { "type": "loss", "content": 0.2133042961359024, "timestamp": "2025-09-02 14:13:22.373065", "step": 909, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:22.425354", "step": 909, "epoch": 1 }, { "type": "loss", "content": 0.12327545136213303, "timestamp": "2025-09-02 14:13:22.428123", "step": 910, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:22.480647", "step": 910, "epoch": 1 }, { "type": "loss", "content": 0.2584299147129059, "timestamp": "2025-09-02 14:13:22.482880", "step": 911, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:22.535392", "step": 911, "epoch": 1 }, { "type": "loss", "content": 0.16504642367362976, "timestamp": "2025-09-02 14:13:22.541116", "step": 912, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:22.592597", "step": 912, "epoch": 1 }, { "type": "loss", "content": 0.23312780261039734, "timestamp": "2025-09-02 14:13:22.594988", "step": 913, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:22.647794", "step": 913, "epoch": 1 }, { "type": "loss", "content": 0.23452647030353546, "timestamp": "2025-09-02 14:13:22.649911", "step": 914, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:22.702601", "step": 914, "epoch": 1 }, { "type": "loss", "content": 0.1741255819797516, "timestamp": "2025-09-02 14:13:22.705486", "step": 915, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:22.758470", "step": 915, "epoch": 1 }, { "type": "loss", "content": 0.12675516307353973, "timestamp": "2025-09-02 14:13:22.764616", "step": 916, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-02 14:13:22.817794", "step": 916, "epoch": 1 }, { "type": "loss", "content": 0.33339935541152954, "timestamp": "2025-09-02 14:13:22.820027", "step": 917, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:22.872580", "step": 917, "epoch": 1 }, { "type": "loss", "content": 0.14603400230407715, "timestamp": "2025-09-02 14:13:22.874815", "step": 918, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:22.928498", "step": 918, "epoch": 1 }, { "type": "loss", "content": 0.20546779036521912, "timestamp": "2025-09-02 14:13:22.930957", "step": 919, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:22.985588", "step": 919, "epoch": 1 }, { "type": "loss", "content": 0.15842491388320923, "timestamp": "2025-09-02 14:13:22.991476", "step": 920, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:13:24.882750", "step": 920, "epoch": 1 }, { "type": "pplx", "content": 7953.907273471604, "timestamp": "2025-09-02 14:13:24.884849", "step": 920, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 920", "timestamp": "2025-09-02 14:13:25.242358", "step": 920, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:25.297545", "step": 920, "epoch": 1 }, { "type": "loss", "content": 0.203263059258461, "timestamp": "2025-09-02 14:13:25.299974", "step": 921, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:25.354359", "step": 921, "epoch": 1 }, { "type": "loss", "content": 0.1361628621816635, "timestamp": "2025-09-02 14:13:25.356727", "step": 922, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:25.428545", "step": 922, "epoch": 1 }, { "type": "loss", "content": 0.16441944241523743, "timestamp": "2025-09-02 14:13:25.430876", "step": 923, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:25.483955", "step": 923, "epoch": 1 }, { "type": "loss", "content": 0.44225698709487915, "timestamp": "2025-09-02 14:13:25.489957", "step": 924, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:25.542526", "step": 924, "epoch": 1 }, { "type": "loss", "content": 0.42324432730674744, "timestamp": "2025-09-02 14:13:25.544665", "step": 925, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:25.597781", "step": 925, "epoch": 1 }, { "type": "loss", "content": 0.16994793713092804, "timestamp": "2025-09-02 14:13:25.600055", "step": 926, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:25.653046", "step": 926, "epoch": 1 }, { "type": "loss", "content": 0.2589470148086548, "timestamp": "2025-09-02 14:13:25.655314", "step": 927, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:25.708025", "step": 927, "epoch": 1 }, { "type": "loss", "content": 0.22528302669525146, "timestamp": "2025-09-02 14:13:25.713861", "step": 928, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:25.766069", "step": 928, "epoch": 1 }, { "type": "loss", "content": 0.14399774372577667, "timestamp": "2025-09-02 14:13:25.769754", "step": 929, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:25.823082", "step": 929, "epoch": 1 }, { "type": "loss", "content": 0.15566828846931458, "timestamp": "2025-09-02 14:13:25.825292", "step": 930, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:25.877792", "step": 930, "epoch": 1 }, { "type": "loss", "content": 0.16948439180850983, "timestamp": "2025-09-02 14:13:25.880079", "step": 931, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:25.933859", "step": 931, "epoch": 1 }, { "type": "loss", "content": 0.13956095278263092, "timestamp": "2025-09-02 14:13:25.939706", "step": 932, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:25.992435", "step": 932, "epoch": 1 }, { "type": "loss", "content": 0.2680423855781555, "timestamp": "2025-09-02 14:13:25.994713", "step": 933, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:26.047284", "step": 933, "epoch": 1 }, { "type": "loss", "content": 0.3114340007305145, "timestamp": "2025-09-02 14:13:26.049337", "step": 934, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:26.102102", "step": 934, "epoch": 1 }, { "type": "loss", "content": 0.21107253432273865, "timestamp": "2025-09-02 14:13:26.104386", "step": 935, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:26.157347", "step": 935, "epoch": 1 }, { "type": "loss", "content": 0.2989773452281952, "timestamp": "2025-09-02 14:13:26.163152", "step": 936, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:26.215658", "step": 936, "epoch": 1 }, { "type": "loss", "content": 0.21634040772914886, "timestamp": "2025-09-02 14:13:26.218262", "step": 937, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:26.271370", "step": 937, "epoch": 1 }, { "type": "loss", "content": 0.12267615646123886, "timestamp": "2025-09-02 14:13:26.273474", "step": 938, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:26.328373", "step": 938, "epoch": 1 }, { "type": "loss", "content": 0.2627514898777008, "timestamp": "2025-09-02 14:13:26.330601", "step": 939, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:26.383443", "step": 939, "epoch": 1 }, { "type": "loss", "content": 0.2750638723373413, "timestamp": "2025-09-02 14:13:26.389220", "step": 940, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:13:28.262742", "step": 940, "epoch": 1 }, { "type": "pplx", "content": 8021.083599765026, "timestamp": "2025-09-02 14:13:28.265472", "step": 940, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:28.317589", "step": 940, "epoch": 1 }, { "type": "loss", "content": 0.1284688264131546, "timestamp": "2025-09-02 14:13:28.319991", "step": 941, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:28.372985", "step": 941, "epoch": 1 }, { "type": "loss", "content": 0.2603037655353546, "timestamp": "2025-09-02 14:13:28.375191", "step": 942, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:28.428440", "step": 942, "epoch": 1 }, { "type": "loss", "content": 0.28176799416542053, "timestamp": "2025-09-02 14:13:28.430698", "step": 943, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:28.483270", "step": 943, "epoch": 1 }, { "type": "loss", "content": 0.07457881420850754, "timestamp": "2025-09-02 14:13:28.489597", "step": 944, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:28.541833", "step": 944, "epoch": 1 }, { "type": "loss", "content": 0.35749468207359314, "timestamp": "2025-09-02 14:13:28.544134", "step": 945, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:28.597020", "step": 945, "epoch": 1 }, { "type": "loss", "content": 0.22442792356014252, "timestamp": "2025-09-02 14:13:28.599344", "step": 946, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:28.653766", "step": 946, "epoch": 1 }, { "type": "loss", "content": 0.3261200785636902, "timestamp": "2025-09-02 14:13:28.655997", "step": 947, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:28.709423", "step": 947, "epoch": 1 }, { "type": "loss", "content": 0.33209145069122314, "timestamp": "2025-09-02 14:13:28.715474", "step": 948, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:28.768128", "step": 948, "epoch": 1 }, { "type": "loss", "content": 0.3142383396625519, "timestamp": "2025-09-02 14:13:28.770155", "step": 949, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:28.823220", "step": 949, "epoch": 1 }, { "type": "loss", "content": 0.14074745774269104, "timestamp": "2025-09-02 14:13:28.825549", "step": 950, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:28.878785", "step": 950, "epoch": 1 }, { "type": "loss", "content": 0.18730595707893372, "timestamp": "2025-09-02 14:13:28.880974", "step": 951, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:28.934065", "step": 951, "epoch": 1 }, { "type": "loss", "content": 0.19189099967479706, "timestamp": "2025-09-02 14:13:28.939832", "step": 952, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:28.992084", "step": 952, "epoch": 1 }, { "type": "loss", "content": 0.1285289078950882, "timestamp": "2025-09-02 14:13:28.994309", "step": 953, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:29.048181", "step": 953, "epoch": 1 }, { "type": "loss", "content": 0.23927120864391327, "timestamp": "2025-09-02 14:13:29.050340", "step": 954, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:29.102741", "step": 954, "epoch": 1 }, { "type": "loss", "content": 0.24538269639015198, "timestamp": "2025-09-02 14:13:29.105015", "step": 955, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:29.158759", "step": 955, "epoch": 1 }, { "type": "loss", "content": 0.23749010264873505, "timestamp": "2025-09-02 14:13:29.164622", "step": 956, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:29.216734", "step": 956, "epoch": 1 }, { "type": "loss", "content": 0.23358607292175293, "timestamp": "2025-09-02 14:13:29.218872", "step": 957, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:29.271926", "step": 957, "epoch": 1 }, { "type": "loss", "content": 0.33760857582092285, "timestamp": "2025-09-02 14:13:29.274174", "step": 958, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:29.326599", "step": 958, "epoch": 1 }, { "type": "loss", "content": 0.17051756381988525, "timestamp": "2025-09-02 14:13:29.328821", "step": 959, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:29.381251", "step": 959, "epoch": 1 }, { "type": "loss", "content": 0.4238995313644409, "timestamp": "2025-09-02 14:13:29.387065", "step": 960, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:13:31.251970", "step": 960, "epoch": 1 }, { "type": "pplx", "content": 7989.650551301516, "timestamp": "2025-09-02 14:13:31.253963", "step": 960, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 960", "timestamp": "2025-09-02 14:13:31.611971", "step": 960, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:31.666784", "step": 960, "epoch": 1 }, { "type": "loss", "content": 0.11755263060331345, "timestamp": "2025-09-02 14:13:31.669231", "step": 961, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:31.722485", "step": 961, "epoch": 1 }, { "type": "loss", "content": 0.1187932938337326, "timestamp": "2025-09-02 14:13:31.724694", "step": 962, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:31.777501", "step": 962, "epoch": 1 }, { "type": "loss", "content": 0.1394844353199005, "timestamp": "2025-09-02 14:13:31.779853", "step": 963, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:31.832400", "step": 963, "epoch": 1 }, { "type": "loss", "content": 0.16519120335578918, "timestamp": "2025-09-02 14:13:31.838660", "step": 964, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:31.892426", "step": 964, "epoch": 1 }, { "type": "loss", "content": 0.20419344305992126, "timestamp": "2025-09-02 14:13:31.894719", "step": 965, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:31.947831", "step": 965, "epoch": 1 }, { "type": "loss", "content": 0.17720942199230194, "timestamp": "2025-09-02 14:13:31.950254", "step": 966, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:32.003030", "step": 966, "epoch": 1 }, { "type": "loss", "content": 0.18962541222572327, "timestamp": "2025-09-02 14:13:32.005300", "step": 967, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:32.058958", "step": 967, "epoch": 1 }, { "type": "loss", "content": 0.2663617432117462, "timestamp": "2025-09-02 14:13:32.065018", "step": 968, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:32.117394", "step": 968, "epoch": 1 }, { "type": "loss", "content": 0.11602035909891129, "timestamp": "2025-09-02 14:13:32.119720", "step": 969, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:32.173461", "step": 969, "epoch": 1 }, { "type": "loss", "content": 0.16105511784553528, "timestamp": "2025-09-02 14:13:32.175719", "step": 970, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:32.229372", "step": 970, "epoch": 1 }, { "type": "loss", "content": 0.22709161043167114, "timestamp": "2025-09-02 14:13:32.231656", "step": 971, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:32.284340", "step": 971, "epoch": 1 }, { "type": "loss", "content": 0.19775280356407166, "timestamp": "2025-09-02 14:13:32.290055", "step": 972, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:32.341982", "step": 972, "epoch": 1 }, { "type": "loss", "content": 0.17027516663074493, "timestamp": "2025-09-02 14:13:32.344153", "step": 973, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:32.398206", "step": 973, "epoch": 1 }, { "type": "loss", "content": 0.1383872777223587, "timestamp": "2025-09-02 14:13:32.400406", "step": 974, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:32.453277", "step": 974, "epoch": 1 }, { "type": "loss", "content": 0.23177017271518707, "timestamp": "2025-09-02 14:13:32.455861", "step": 975, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:32.508497", "step": 975, "epoch": 1 }, { "type": "loss", "content": 0.19974973797798157, "timestamp": "2025-09-02 14:13:32.514529", "step": 976, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:32.566593", "step": 976, "epoch": 1 }, { "type": "loss", "content": 0.24568890035152435, "timestamp": "2025-09-02 14:13:32.568819", "step": 977, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:32.621237", "step": 977, "epoch": 1 }, { "type": "loss", "content": 0.17292994260787964, "timestamp": "2025-09-02 14:13:32.623364", "step": 978, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:32.677357", "step": 978, "epoch": 1 }, { "type": "loss", "content": 0.3181510865688324, "timestamp": "2025-09-02 14:13:32.679611", "step": 979, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:32.732385", "step": 979, "epoch": 1 }, { "type": "loss", "content": 0.16834627091884613, "timestamp": "2025-09-02 14:13:32.738392", "step": 980, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:13:34.624064", "step": 980, "epoch": 1 }, { "type": "pplx", "content": 7982.142017240361, "timestamp": "2025-09-02 14:13:34.626376", "step": 980, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:34.679229", "step": 980, "epoch": 1 }, { "type": "loss", "content": 0.17323628067970276, "timestamp": "2025-09-02 14:13:34.682934", "step": 981, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:34.736730", "step": 981, "epoch": 1 }, { "type": "loss", "content": 0.09927472472190857, "timestamp": "2025-09-02 14:13:34.739924", "step": 982, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:34.792644", "step": 982, "epoch": 1 }, { "type": "loss", "content": 0.17951565980911255, "timestamp": "2025-09-02 14:13:34.794857", "step": 983, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:34.848047", "step": 983, "epoch": 1 }, { "type": "loss", "content": 0.2816520035266876, "timestamp": "2025-09-02 14:13:34.854555", "step": 984, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:34.906214", "step": 984, "epoch": 1 }, { "type": "loss", "content": 0.27632540464401245, "timestamp": "2025-09-02 14:13:34.908624", "step": 985, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:34.961797", "step": 985, "epoch": 1 }, { "type": "loss", "content": 0.21069876849651337, "timestamp": "2025-09-02 14:13:34.963988", "step": 986, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:35.017944", "step": 986, "epoch": 1 }, { "type": "loss", "content": 0.19941924512386322, "timestamp": "2025-09-02 14:13:35.020209", "step": 987, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:35.074522", "step": 987, "epoch": 1 }, { "type": "loss", "content": 0.17596979439258575, "timestamp": "2025-09-02 14:13:35.080432", "step": 988, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:35.133059", "step": 988, "epoch": 1 }, { "type": "loss", "content": 0.21535572409629822, "timestamp": "2025-09-02 14:13:35.135358", "step": 989, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:35.188192", "step": 989, "epoch": 1 }, { "type": "loss", "content": 0.15577475726604462, "timestamp": "2025-09-02 14:13:35.191763", "step": 990, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:35.244985", "step": 990, "epoch": 1 }, { "type": "loss", "content": 0.28968891501426697, "timestamp": "2025-09-02 14:13:35.247959", "step": 991, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:35.301581", "step": 991, "epoch": 1 }, { "type": "loss", "content": 0.22587871551513672, "timestamp": "2025-09-02 14:13:35.308355", "step": 992, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:35.361261", "step": 992, "epoch": 1 }, { "type": "loss", "content": 0.15957124531269073, "timestamp": "2025-09-02 14:13:35.363623", "step": 993, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:35.416092", "step": 993, "epoch": 1 }, { "type": "loss", "content": 0.14073289930820465, "timestamp": "2025-09-02 14:13:35.418087", "step": 994, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:35.470388", "step": 994, "epoch": 1 }, { "type": "loss", "content": 0.2885810136795044, "timestamp": "2025-09-02 14:13:35.472738", "step": 995, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:35.525291", "step": 995, "epoch": 1 }, { "type": "loss", "content": 0.21816349029541016, "timestamp": "2025-09-02 14:13:35.531168", "step": 996, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:35.584619", "step": 996, "epoch": 1 }, { "type": "loss", "content": 0.3184717297554016, "timestamp": "2025-09-02 14:13:35.586872", "step": 997, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:35.640610", "step": 997, "epoch": 1 }, { "type": "loss", "content": 0.36991363763809204, "timestamp": "2025-09-02 14:13:35.642861", "step": 998, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:35.695468", "step": 998, "epoch": 1 }, { "type": "loss", "content": 0.2254640758037567, "timestamp": "2025-09-02 14:13:35.697795", "step": 999, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:35.750921", "step": 999, "epoch": 1 }, { "type": "loss", "content": 0.2222105711698532, "timestamp": "2025-09-02 14:13:35.756705", "step": 1000, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:13:37.640042", "step": 1000, "epoch": 1 }, { "type": "pplx", "content": 8024.749503460549, "timestamp": "2025-09-02 14:13:37.642166", "step": 1000, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1000", "timestamp": "2025-09-02 14:13:38.002866", "step": 1000, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:38.056043", "step": 1000, "epoch": 1 }, { "type": "loss", "content": 0.14901156723499298, "timestamp": "2025-09-02 14:13:38.058141", "step": 1001, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:38.111118", "step": 1001, "epoch": 1 }, { "type": "loss", "content": 0.19966204464435577, "timestamp": "2025-09-02 14:13:38.113256", "step": 1002, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:38.165963", "step": 1002, "epoch": 1 }, { "type": "loss", "content": 0.1647142767906189, "timestamp": "2025-09-02 14:13:38.168038", "step": 1003, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:38.221087", "step": 1003, "epoch": 1 }, { "type": "loss", "content": 0.18640556931495667, "timestamp": "2025-09-02 14:13:38.227117", "step": 1004, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:38.279466", "step": 1004, "epoch": 1 }, { "type": "loss", "content": 0.15088747441768646, "timestamp": "2025-09-02 14:13:38.281508", "step": 1005, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:38.334042", "step": 1005, "epoch": 1 }, { "type": "loss", "content": 0.1464051753282547, "timestamp": "2025-09-02 14:13:38.336367", "step": 1006, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:38.389945", "step": 1006, "epoch": 1 }, { "type": "loss", "content": 0.133381187915802, "timestamp": "2025-09-02 14:13:38.392181", "step": 1007, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:38.445253", "step": 1007, "epoch": 1 }, { "type": "loss", "content": 0.3176492154598236, "timestamp": "2025-09-02 14:13:38.451179", "step": 1008, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:38.503720", "step": 1008, "epoch": 1 }, { "type": "loss", "content": 0.3005720376968384, "timestamp": "2025-09-02 14:13:38.506152", "step": 1009, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:38.559236", "step": 1009, "epoch": 1 }, { "type": "loss", "content": 0.21351377665996552, "timestamp": "2025-09-02 14:13:38.561501", "step": 1010, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:38.614637", "step": 1010, "epoch": 1 }, { "type": "loss", "content": 0.20070908963680267, "timestamp": "2025-09-02 14:13:38.617288", "step": 1011, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:38.670744", "step": 1011, "epoch": 1 }, { "type": "loss", "content": 0.18198685348033905, "timestamp": "2025-09-02 14:13:38.676614", "step": 1012, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:38.729421", "step": 1012, "epoch": 1 }, { "type": "loss", "content": 0.1708039790391922, "timestamp": "2025-09-02 14:13:38.731512", "step": 1013, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:38.786038", "step": 1013, "epoch": 1 }, { "type": "loss", "content": 0.10762366652488708, "timestamp": "2025-09-02 14:13:38.788563", "step": 1014, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:38.841344", "step": 1014, "epoch": 1 }, { "type": "loss", "content": 0.19667816162109375, "timestamp": "2025-09-02 14:13:38.843532", "step": 1015, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:38.896512", "step": 1015, "epoch": 1 }, { "type": "loss", "content": 0.1312372088432312, "timestamp": "2025-09-02 14:13:38.902243", "step": 1016, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:38.954535", "step": 1016, "epoch": 1 }, { "type": "loss", "content": 0.13296379148960114, "timestamp": "2025-09-02 14:13:38.956595", "step": 1017, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:39.009261", "step": 1017, "epoch": 1 }, { "type": "loss", "content": 0.2268930971622467, "timestamp": "2025-09-02 14:13:39.011347", "step": 1018, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:39.064597", "step": 1018, "epoch": 1 }, { "type": "loss", "content": 0.2496548444032669, "timestamp": "2025-09-02 14:13:39.066983", "step": 1019, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:39.120397", "step": 1019, "epoch": 1 }, { "type": "loss", "content": 0.17281349003314972, "timestamp": "2025-09-02 14:13:39.126280", "step": 1020, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:13:40.994772", "step": 1020, "epoch": 1 }, { "type": "pplx", "content": 8074.00550704583, "timestamp": "2025-09-02 14:13:40.997282", "step": 1020, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:41.051409", "step": 1020, "epoch": 1 }, { "type": "loss", "content": 0.2731595039367676, "timestamp": "2025-09-02 14:13:41.053630", "step": 1021, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:41.106322", "step": 1021, "epoch": 1 }, { "type": "loss", "content": 0.27621641755104065, "timestamp": "2025-09-02 14:13:41.108534", "step": 1022, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:41.160782", "step": 1022, "epoch": 1 }, { "type": "loss", "content": 0.23559372127056122, "timestamp": "2025-09-02 14:13:41.164312", "step": 1023, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:41.216842", "step": 1023, "epoch": 1 }, { "type": "loss", "content": 0.18078866600990295, "timestamp": "2025-09-02 14:13:41.222918", "step": 1024, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:41.274913", "step": 1024, "epoch": 1 }, { "type": "loss", "content": 0.21523979306221008, "timestamp": "2025-09-02 14:13:41.276639", "step": 1025, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:41.328761", "step": 1025, "epoch": 1 }, { "type": "loss", "content": 0.27395179867744446, "timestamp": "2025-09-02 14:13:41.330719", "step": 1026, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:41.383409", "step": 1026, "epoch": 1 }, { "type": "loss", "content": 0.1637527346611023, "timestamp": "2025-09-02 14:13:41.385623", "step": 1027, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:41.440276", "step": 1027, "epoch": 1 }, { "type": "loss", "content": 0.18228325247764587, "timestamp": "2025-09-02 14:13:41.446528", "step": 1028, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:41.498582", "step": 1028, "epoch": 1 }, { "type": "loss", "content": 0.2157241553068161, "timestamp": "2025-09-02 14:13:41.500793", "step": 1029, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:41.553482", "step": 1029, "epoch": 1 }, { "type": "loss", "content": 0.2327858805656433, "timestamp": "2025-09-02 14:13:41.555725", "step": 1030, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:41.608072", "step": 1030, "epoch": 1 }, { "type": "loss", "content": 0.12871715426445007, "timestamp": "2025-09-02 14:13:41.610435", "step": 1031, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:41.663414", "step": 1031, "epoch": 1 }, { "type": "loss", "content": 0.18858003616333008, "timestamp": "2025-09-02 14:13:41.669226", "step": 1032, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:41.721802", "step": 1032, "epoch": 1 }, { "type": "loss", "content": 0.21019500494003296, "timestamp": "2025-09-02 14:13:41.724197", "step": 1033, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:41.777097", "step": 1033, "epoch": 1 }, { "type": "loss", "content": 0.14526338875293732, "timestamp": "2025-09-02 14:13:41.779183", "step": 1034, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:41.841287", "step": 1034, "epoch": 1 }, { "type": "loss", "content": 0.17281746864318848, "timestamp": "2025-09-02 14:13:41.843021", "step": 1035, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:41.895083", "step": 1035, "epoch": 1 }, { "type": "loss", "content": 0.19006268680095673, "timestamp": "2025-09-02 14:13:41.900478", "step": 1036, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:41.952574", "step": 1036, "epoch": 1 }, { "type": "loss", "content": 0.1842532455921173, "timestamp": "2025-09-02 14:13:41.954715", "step": 1037, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:42.007288", "step": 1037, "epoch": 1 }, { "type": "loss", "content": 0.1499043107032776, "timestamp": "2025-09-02 14:13:42.009410", "step": 1038, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:42.062378", "step": 1038, "epoch": 1 }, { "type": "loss", "content": 0.2643926739692688, "timestamp": "2025-09-02 14:13:42.064378", "step": 1039, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:42.116952", "step": 1039, "epoch": 1 }, { "type": "loss", "content": 0.11790357530117035, "timestamp": "2025-09-02 14:13:42.123284", "step": 1040, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:13:44.014438", "step": 1040, "epoch": 1 }, { "type": "pplx", "content": 8241.098372075707, "timestamp": "2025-09-02 14:13:44.016518", "step": 1040, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1040", "timestamp": "2025-09-02 14:13:44.386921", "step": 1040, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:44.445328", "step": 1040, "epoch": 1 }, { "type": "loss", "content": 0.2807217240333557, "timestamp": "2025-09-02 14:13:44.447719", "step": 1041, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:44.503106", "step": 1041, "epoch": 1 }, { "type": "loss", "content": 0.30823469161987305, "timestamp": "2025-09-02 14:13:44.506879", "step": 1042, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:44.560750", "step": 1042, "epoch": 1 }, { "type": "loss", "content": 0.16432715952396393, "timestamp": "2025-09-02 14:13:44.563118", "step": 1043, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:44.617314", "step": 1043, "epoch": 1 }, { "type": "loss", "content": 0.2478114515542984, "timestamp": "2025-09-02 14:13:44.623696", "step": 1044, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:44.676637", "step": 1044, "epoch": 1 }, { "type": "loss", "content": 0.28707414865493774, "timestamp": "2025-09-02 14:13:44.679239", "step": 1045, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:44.732169", "step": 1045, "epoch": 1 }, { "type": "loss", "content": 0.2376083880662918, "timestamp": "2025-09-02 14:13:44.734710", "step": 1046, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:44.788082", "step": 1046, "epoch": 1 }, { "type": "loss", "content": 0.12870855629444122, "timestamp": "2025-09-02 14:13:44.790013", "step": 1047, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:44.843222", "step": 1047, "epoch": 1 }, { "type": "loss", "content": 0.20600318908691406, "timestamp": "2025-09-02 14:13:44.849429", "step": 1048, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:44.902166", "step": 1048, "epoch": 1 }, { "type": "loss", "content": 0.15754878520965576, "timestamp": "2025-09-02 14:13:44.904253", "step": 1049, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:44.958847", "step": 1049, "epoch": 1 }, { "type": "loss", "content": 0.1782093346118927, "timestamp": "2025-09-02 14:13:44.961916", "step": 1050, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:45.015606", "step": 1050, "epoch": 1 }, { "type": "loss", "content": 0.22241336107254028, "timestamp": "2025-09-02 14:13:45.018021", "step": 1051, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:45.071712", "step": 1051, "epoch": 1 }, { "type": "loss", "content": 0.21097983419895172, "timestamp": "2025-09-02 14:13:45.077825", "step": 1052, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:45.130273", "step": 1052, "epoch": 1 }, { "type": "loss", "content": 0.17279930412769318, "timestamp": "2025-09-02 14:13:45.132672", "step": 1053, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:45.186315", "step": 1053, "epoch": 1 }, { "type": "loss", "content": 0.33242619037628174, "timestamp": "2025-09-02 14:13:45.188844", "step": 1054, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:45.242934", "step": 1054, "epoch": 1 }, { "type": "loss", "content": 0.2733869254589081, "timestamp": "2025-09-02 14:13:45.245333", "step": 1055, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:45.299653", "step": 1055, "epoch": 1 }, { "type": "loss", "content": 0.2696478068828583, "timestamp": "2025-09-02 14:13:45.305841", "step": 1056, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:45.358624", "step": 1056, "epoch": 1 }, { "type": "loss", "content": 0.2785112261772156, "timestamp": "2025-09-02 14:13:45.360890", "step": 1057, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:45.414087", "step": 1057, "epoch": 1 }, { "type": "loss", "content": 0.2210589498281479, "timestamp": "2025-09-02 14:13:45.417757", "step": 1058, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:45.471186", "step": 1058, "epoch": 1 }, { "type": "loss", "content": 0.1760336309671402, "timestamp": "2025-09-02 14:13:45.474186", "step": 1059, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:45.528115", "step": 1059, "epoch": 1 }, { "type": "loss", "content": 0.0851527601480484, "timestamp": "2025-09-02 14:13:45.533911", "step": 1060, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:13:48.092271", "step": 1060, "epoch": 1 }, { "type": "pplx", "content": 8341.43591942185, "timestamp": "2025-09-02 14:13:48.094097", "step": 1060, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:48.165011", "step": 1060, "epoch": 1 }, { "type": "loss", "content": 0.14682649075984955, "timestamp": "2025-09-02 14:13:48.168608", "step": 1061, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:48.249398", "step": 1061, "epoch": 1 }, { "type": "loss", "content": 0.4056940972805023, "timestamp": "2025-09-02 14:13:48.251837", "step": 1062, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:48.313585", "step": 1062, "epoch": 1 }, { "type": "loss", "content": 0.20123642683029175, "timestamp": "2025-09-02 14:13:48.315907", "step": 1063, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:48.372937", "step": 1063, "epoch": 1 }, { "type": "loss", "content": 0.30507856607437134, "timestamp": "2025-09-02 14:13:48.378941", "step": 1064, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:48.440775", "step": 1064, "epoch": 1 }, { "type": "loss", "content": 0.17818179726600647, "timestamp": "2025-09-02 14:13:48.443247", "step": 1065, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:48.517630", "step": 1065, "epoch": 1 }, { "type": "loss", "content": 0.20798726379871368, "timestamp": "2025-09-02 14:13:48.519905", "step": 1066, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:48.605649", "step": 1066, "epoch": 1 }, { "type": "loss", "content": 0.21817444264888763, "timestamp": "2025-09-02 14:13:48.607605", "step": 1067, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:48.697824", "step": 1067, "epoch": 1 }, { "type": "loss", "content": 0.15860521793365479, "timestamp": "2025-09-02 14:13:48.704237", "step": 1068, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:48.773468", "step": 1068, "epoch": 1 }, { "type": "loss", "content": 0.22968651354312897, "timestamp": "2025-09-02 14:13:48.775504", "step": 1069, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:48.849799", "step": 1069, "epoch": 1 }, { "type": "loss", "content": 0.3198372423648834, "timestamp": "2025-09-02 14:13:48.851963", "step": 1070, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:48.919617", "step": 1070, "epoch": 1 }, { "type": "loss", "content": 0.10353555530309677, "timestamp": "2025-09-02 14:13:48.921713", "step": 1071, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:48.989531", "step": 1071, "epoch": 1 }, { "type": "loss", "content": 0.10393117368221283, "timestamp": "2025-09-02 14:13:48.995287", "step": 1072, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:49.063085", "step": 1072, "epoch": 1 }, { "type": "loss", "content": 0.315741628408432, "timestamp": "2025-09-02 14:13:49.066126", "step": 1073, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:49.127428", "step": 1073, "epoch": 1 }, { "type": "loss", "content": 0.28502997756004333, "timestamp": "2025-09-02 14:13:49.129764", "step": 1074, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:49.183915", "step": 1074, "epoch": 1 }, { "type": "loss", "content": 0.3344189524650574, "timestamp": "2025-09-02 14:13:49.186283", "step": 1075, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:49.241662", "step": 1075, "epoch": 1 }, { "type": "loss", "content": 0.08950699865818024, "timestamp": "2025-09-02 14:13:49.247797", "step": 1076, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:49.302151", "step": 1076, "epoch": 1 }, { "type": "loss", "content": 0.1999254673719406, "timestamp": "2025-09-02 14:13:49.305065", "step": 1077, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:49.360047", "step": 1077, "epoch": 1 }, { "type": "loss", "content": 0.20974458754062653, "timestamp": "2025-09-02 14:13:49.362231", "step": 1078, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:49.421783", "step": 1078, "epoch": 1 }, { "type": "loss", "content": 0.2947329580783844, "timestamp": "2025-09-02 14:13:49.424125", "step": 1079, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:49.488668", "step": 1079, "epoch": 1 }, { "type": "loss", "content": 0.1487163007259369, "timestamp": "2025-09-02 14:13:49.494354", "step": 1080, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:13:51.398569", "step": 1080, "epoch": 1 }, { "type": "pplx", "content": 8295.549915393258, "timestamp": "2025-09-02 14:13:51.400798", "step": 1080, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1080", "timestamp": "2025-09-02 14:13:51.760222", "step": 1080, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:51.817391", "step": 1080, "epoch": 1 }, { "type": "loss", "content": 0.23408669233322144, "timestamp": "2025-09-02 14:13:51.819386", "step": 1081, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:51.874505", "step": 1081, "epoch": 1 }, { "type": "loss", "content": 0.16290736198425293, "timestamp": "2025-09-02 14:13:51.876875", "step": 1082, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:51.931158", "step": 1082, "epoch": 1 }, { "type": "loss", "content": 0.24600014090538025, "timestamp": "2025-09-02 14:13:51.933446", "step": 1083, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:51.987760", "step": 1083, "epoch": 1 }, { "type": "loss", "content": 0.15879462659358978, "timestamp": "2025-09-02 14:13:51.994030", "step": 1084, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:52.046763", "step": 1084, "epoch": 1 }, { "type": "loss", "content": 0.21940895915031433, "timestamp": "2025-09-02 14:13:52.049095", "step": 1085, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:52.102513", "step": 1085, "epoch": 1 }, { "type": "loss", "content": 0.1802186518907547, "timestamp": "2025-09-02 14:13:52.104705", "step": 1086, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:52.157686", "step": 1086, "epoch": 1 }, { "type": "loss", "content": 0.08884047716856003, "timestamp": "2025-09-02 14:13:52.160089", "step": 1087, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:52.212592", "step": 1087, "epoch": 1 }, { "type": "loss", "content": 0.23544101417064667, "timestamp": "2025-09-02 14:13:52.218745", "step": 1088, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:52.271678", "step": 1088, "epoch": 1 }, { "type": "loss", "content": 0.2118818759918213, "timestamp": "2025-09-02 14:13:52.273774", "step": 1089, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:52.328787", "step": 1089, "epoch": 1 }, { "type": "loss", "content": 0.09115579724311829, "timestamp": "2025-09-02 14:13:52.331081", "step": 1090, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:52.385071", "step": 1090, "epoch": 1 }, { "type": "loss", "content": 0.25368037819862366, "timestamp": "2025-09-02 14:13:52.387466", "step": 1091, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:52.441022", "step": 1091, "epoch": 1 }, { "type": "loss", "content": 0.12354286760091782, "timestamp": "2025-09-02 14:13:52.447183", "step": 1092, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:52.499662", "step": 1092, "epoch": 1 }, { "type": "loss", "content": 0.20813098549842834, "timestamp": "2025-09-02 14:13:52.501849", "step": 1093, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:52.554556", "step": 1093, "epoch": 1 }, { "type": "loss", "content": 0.22072681784629822, "timestamp": "2025-09-02 14:13:52.556885", "step": 1094, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:52.610252", "step": 1094, "epoch": 1 }, { "type": "loss", "content": 0.15782783925533295, "timestamp": "2025-09-02 14:13:52.612958", "step": 1095, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:52.666345", "step": 1095, "epoch": 1 }, { "type": "loss", "content": 0.30868715047836304, "timestamp": "2025-09-02 14:13:52.672506", "step": 1096, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:52.724586", "step": 1096, "epoch": 1 }, { "type": "loss", "content": 0.08992698788642883, "timestamp": "2025-09-02 14:13:52.726525", "step": 1097, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:52.779271", "step": 1097, "epoch": 1 }, { "type": "loss", "content": 0.0930643305182457, "timestamp": "2025-09-02 14:13:52.781591", "step": 1098, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:52.834546", "step": 1098, "epoch": 1 }, { "type": "loss", "content": 0.16849565505981445, "timestamp": "2025-09-02 14:13:52.836792", "step": 1099, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:52.890374", "step": 1099, "epoch": 1 }, { "type": "loss", "content": 0.22978338599205017, "timestamp": "2025-09-02 14:13:52.896235", "step": 1100, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:13:54.803177", "step": 1100, "epoch": 1 }, { "type": "pplx", "content": 8352.057406785814, "timestamp": "2025-09-02 14:13:54.805455", "step": 1100, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:54.858067", "step": 1100, "epoch": 1 }, { "type": "loss", "content": 0.15095049142837524, "timestamp": "2025-09-02 14:13:54.860265", "step": 1101, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:54.914544", "step": 1101, "epoch": 1 }, { "type": "loss", "content": 0.18399545550346375, "timestamp": "2025-09-02 14:13:54.916954", "step": 1102, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:54.972367", "step": 1102, "epoch": 1 }, { "type": "loss", "content": 0.1274794489145279, "timestamp": "2025-09-02 14:13:54.974713", "step": 1103, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:55.029891", "step": 1103, "epoch": 1 }, { "type": "loss", "content": 0.13304054737091064, "timestamp": "2025-09-02 14:13:55.035915", "step": 1104, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:55.107099", "step": 1104, "epoch": 1 }, { "type": "loss", "content": 0.22677811980247498, "timestamp": "2025-09-02 14:13:55.109418", "step": 1105, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:55.177489", "step": 1105, "epoch": 1 }, { "type": "loss", "content": 0.13629919290542603, "timestamp": "2025-09-02 14:13:55.179886", "step": 1106, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:55.234397", "step": 1106, "epoch": 1 }, { "type": "loss", "content": 0.2791527211666107, "timestamp": "2025-09-02 14:13:55.236875", "step": 1107, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:55.299485", "step": 1107, "epoch": 1 }, { "type": "loss", "content": 0.1537906676530838, "timestamp": "2025-09-02 14:13:55.305610", "step": 1108, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:55.369947", "step": 1108, "epoch": 1 }, { "type": "loss", "content": 0.2401839643716812, "timestamp": "2025-09-02 14:13:55.372735", "step": 1109, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:55.450691", "step": 1109, "epoch": 1 }, { "type": "loss", "content": 0.19901221990585327, "timestamp": "2025-09-02 14:13:55.453257", "step": 1110, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:55.526238", "step": 1110, "epoch": 1 }, { "type": "loss", "content": 0.2877916395664215, "timestamp": "2025-09-02 14:13:55.528855", "step": 1111, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:55.584215", "step": 1111, "epoch": 1 }, { "type": "loss", "content": 0.11486170440912247, "timestamp": "2025-09-02 14:13:55.590369", "step": 1112, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:55.643170", "step": 1112, "epoch": 1 }, { "type": "loss", "content": 0.35099852085113525, "timestamp": "2025-09-02 14:13:55.645261", "step": 1113, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:55.699927", "step": 1113, "epoch": 1 }, { "type": "loss", "content": 0.2744300365447998, "timestamp": "2025-09-02 14:13:55.702040", "step": 1114, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:55.755703", "step": 1114, "epoch": 1 }, { "type": "loss", "content": 0.18548500537872314, "timestamp": "2025-09-02 14:13:55.757925", "step": 1115, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:55.811524", "step": 1115, "epoch": 1 }, { "type": "loss", "content": 0.39037665724754333, "timestamp": "2025-09-02 14:13:55.817461", "step": 1116, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:55.870231", "step": 1116, "epoch": 1 }, { "type": "loss", "content": 0.15352550148963928, "timestamp": "2025-09-02 14:13:55.872583", "step": 1117, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:55.926082", "step": 1117, "epoch": 1 }, { "type": "loss", "content": 0.18414539098739624, "timestamp": "2025-09-02 14:13:55.929829", "step": 1118, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:55.983305", "step": 1118, "epoch": 1 }, { "type": "loss", "content": 0.18027164041996002, "timestamp": "2025-09-02 14:13:55.985657", "step": 1119, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:56.038766", "step": 1119, "epoch": 1 }, { "type": "loss", "content": 0.11202987283468246, "timestamp": "2025-09-02 14:13:56.044926", "step": 1120, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:13:57.937818", "step": 1120, "epoch": 1 }, { "type": "pplx", "content": 8531.03563087373, "timestamp": "2025-09-02 14:13:57.940007", "step": 1120, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1120", "timestamp": "2025-09-02 14:13:58.314969", "step": 1120, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:58.371057", "step": 1120, "epoch": 1 }, { "type": "loss", "content": 0.15444879233837128, "timestamp": "2025-09-02 14:13:58.373319", "step": 1121, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:58.428861", "step": 1121, "epoch": 1 }, { "type": "loss", "content": 0.14769889414310455, "timestamp": "2025-09-02 14:13:58.431064", "step": 1122, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:58.484839", "step": 1122, "epoch": 1 }, { "type": "loss", "content": 0.3150542378425598, "timestamp": "2025-09-02 14:13:58.487173", "step": 1123, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:58.540926", "step": 1123, "epoch": 1 }, { "type": "loss", "content": 0.21344295144081116, "timestamp": "2025-09-02 14:13:58.547233", "step": 1124, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:58.600051", "step": 1124, "epoch": 1 }, { "type": "loss", "content": 0.20581604540348053, "timestamp": "2025-09-02 14:13:58.601872", "step": 1125, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:58.656558", "step": 1125, "epoch": 1 }, { "type": "loss", "content": 0.27154824137687683, "timestamp": "2025-09-02 14:13:58.658158", "step": 1126, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:58.711428", "step": 1126, "epoch": 1 }, { "type": "loss", "content": 0.2379152625799179, "timestamp": "2025-09-02 14:13:58.713309", "step": 1127, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:58.768277", "step": 1127, "epoch": 1 }, { "type": "loss", "content": 0.1269916445016861, "timestamp": "2025-09-02 14:13:58.774225", "step": 1128, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:58.828812", "step": 1128, "epoch": 1 }, { "type": "loss", "content": 0.187445268034935, "timestamp": "2025-09-02 14:13:58.831084", "step": 1129, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:58.884044", "step": 1129, "epoch": 1 }, { "type": "loss", "content": 0.22557400166988373, "timestamp": "2025-09-02 14:13:58.886357", "step": 1130, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:58.941173", "step": 1130, "epoch": 1 }, { "type": "loss", "content": 0.11784407496452332, "timestamp": "2025-09-02 14:13:58.944113", "step": 1131, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:58.997629", "step": 1131, "epoch": 1 }, { "type": "loss", "content": 0.12824895977973938, "timestamp": "2025-09-02 14:13:59.003807", "step": 1132, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:59.057830", "step": 1132, "epoch": 1 }, { "type": "loss", "content": 0.06460969150066376, "timestamp": "2025-09-02 14:13:59.059802", "step": 1133, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:59.116468", "step": 1133, "epoch": 1 }, { "type": "loss", "content": 0.06170053407549858, "timestamp": "2025-09-02 14:13:59.118286", "step": 1134, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:59.178253", "step": 1134, "epoch": 1 }, { "type": "loss", "content": 0.31876593828201294, "timestamp": "2025-09-02 14:13:59.180261", "step": 1135, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:13:59.241001", "step": 1135, "epoch": 1 }, { "type": "loss", "content": 0.2052164077758789, "timestamp": "2025-09-02 14:13:59.247457", "step": 1136, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:59.300524", "step": 1136, "epoch": 1 }, { "type": "loss", "content": 0.09902020543813705, "timestamp": "2025-09-02 14:13:59.303811", "step": 1137, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:59.361771", "step": 1137, "epoch": 1 }, { "type": "loss", "content": 0.1332462728023529, "timestamp": "2025-09-02 14:13:59.367840", "step": 1138, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:13:59.422203", "step": 1138, "epoch": 1 }, { "type": "loss", "content": 0.1512727439403534, "timestamp": "2025-09-02 14:13:59.424656", "step": 1139, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:13:59.477993", "step": 1139, "epoch": 1 }, { "type": "loss", "content": 0.20674191415309906, "timestamp": "2025-09-02 14:13:59.483873", "step": 1140, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:14:01.387304", "step": 1140, "epoch": 1 }, { "type": "pplx", "content": 8567.59214310508, "timestamp": "2025-09-02 14:14:01.389494", "step": 1140, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:01.442675", "step": 1140, "epoch": 1 }, { "type": "loss", "content": 0.19470080733299255, "timestamp": "2025-09-02 14:14:01.444860", "step": 1141, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:01.498025", "step": 1141, "epoch": 1 }, { "type": "loss", "content": 0.14560677111148834, "timestamp": "2025-09-02 14:14:01.499303", "step": 1142, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:01.552353", "step": 1142, "epoch": 1 }, { "type": "loss", "content": 0.3292316794395447, "timestamp": "2025-09-02 14:14:01.554479", "step": 1143, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:01.607690", "step": 1143, "epoch": 1 }, { "type": "loss", "content": 0.09268055111169815, "timestamp": "2025-09-02 14:14:01.613998", "step": 1144, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:01.666042", "step": 1144, "epoch": 1 }, { "type": "loss", "content": 0.22264838218688965, "timestamp": "2025-09-02 14:14:01.668154", "step": 1145, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:01.721328", "step": 1145, "epoch": 1 }, { "type": "loss", "content": 0.22078295052051544, "timestamp": "2025-09-02 14:14:01.724755", "step": 1146, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:01.778054", "step": 1146, "epoch": 1 }, { "type": "loss", "content": 0.16867998242378235, "timestamp": "2025-09-02 14:14:01.780473", "step": 1147, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:01.834843", "step": 1147, "epoch": 1 }, { "type": "loss", "content": 0.2256784439086914, "timestamp": "2025-09-02 14:14:01.840450", "step": 1148, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:01.894427", "step": 1148, "epoch": 1 }, { "type": "loss", "content": 0.2493252009153366, "timestamp": "2025-09-02 14:14:01.896126", "step": 1149, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:01.948498", "step": 1149, "epoch": 1 }, { "type": "loss", "content": 0.22366905212402344, "timestamp": "2025-09-02 14:14:01.950817", "step": 1150, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:02.003722", "step": 1150, "epoch": 1 }, { "type": "loss", "content": 0.1892005354166031, "timestamp": "2025-09-02 14:14:02.005812", "step": 1151, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:02.058897", "step": 1151, "epoch": 1 }, { "type": "loss", "content": 0.07456947863101959, "timestamp": "2025-09-02 14:14:02.065228", "step": 1152, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:02.118060", "step": 1152, "epoch": 1 }, { "type": "loss", "content": 0.1476522535085678, "timestamp": "2025-09-02 14:14:02.120385", "step": 1153, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:02.174773", "step": 1153, "epoch": 1 }, { "type": "loss", "content": 0.1921290159225464, "timestamp": "2025-09-02 14:14:02.177213", "step": 1154, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:02.231083", "step": 1154, "epoch": 1 }, { "type": "loss", "content": 0.2406744658946991, "timestamp": "2025-09-02 14:14:02.233666", "step": 1155, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:02.286272", "step": 1155, "epoch": 1 }, { "type": "loss", "content": 0.1965147852897644, "timestamp": "2025-09-02 14:14:02.292415", "step": 1156, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:02.345072", "step": 1156, "epoch": 1 }, { "type": "loss", "content": 0.2052738219499588, "timestamp": "2025-09-02 14:14:02.348117", "step": 1157, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:02.400726", "step": 1157, "epoch": 1 }, { "type": "loss", "content": 0.12430580705404282, "timestamp": "2025-09-02 14:14:02.402813", "step": 1158, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:02.456448", "step": 1158, "epoch": 1 }, { "type": "loss", "content": 0.15697512030601501, "timestamp": "2025-09-02 14:14:02.460600", "step": 1159, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:02.514036", "step": 1159, "epoch": 1 }, { "type": "loss", "content": 0.34209051728248596, "timestamp": "2025-09-02 14:14:02.520029", "step": 1160, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:14:04.436438", "step": 1160, "epoch": 1 }, { "type": "pplx", "content": 8716.429952581291, "timestamp": "2025-09-02 14:14:04.438725", "step": 1160, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1160", "timestamp": "2025-09-02 14:14:04.871258", "step": 1160, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:04.926514", "step": 1160, "epoch": 1 }, { "type": "loss", "content": 0.4896734058856964, "timestamp": "2025-09-02 14:14:04.928598", "step": 1161, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:04.983052", "step": 1161, "epoch": 1 }, { "type": "loss", "content": 0.14651475846767426, "timestamp": "2025-09-02 14:14:04.985327", "step": 1162, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:05.038841", "step": 1162, "epoch": 1 }, { "type": "loss", "content": 0.16895799338817596, "timestamp": "2025-09-02 14:14:05.041119", "step": 1163, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:05.094865", "step": 1163, "epoch": 1 }, { "type": "loss", "content": 0.3238910138607025, "timestamp": "2025-09-02 14:14:05.100826", "step": 1164, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:05.154045", "step": 1164, "epoch": 1 }, { "type": "loss", "content": 0.1637531965970993, "timestamp": "2025-09-02 14:14:05.156238", "step": 1165, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:05.213800", "step": 1165, "epoch": 1 }, { "type": "loss", "content": 0.13800232112407684, "timestamp": "2025-09-02 14:14:05.216006", "step": 1166, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:05.273193", "step": 1166, "epoch": 1 }, { "type": "loss", "content": 0.20565161108970642, "timestamp": "2025-09-02 14:14:05.275527", "step": 1167, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:05.329762", "step": 1167, "epoch": 1 }, { "type": "loss", "content": 0.4508979618549347, "timestamp": "2025-09-02 14:14:05.335499", "step": 1168, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:05.391697", "step": 1168, "epoch": 1 }, { "type": "loss", "content": 0.1969025582075119, "timestamp": "2025-09-02 14:14:05.393852", "step": 1169, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:05.451105", "step": 1169, "epoch": 1 }, { "type": "loss", "content": 0.20033395290374756, "timestamp": "2025-09-02 14:14:05.453411", "step": 1170, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:05.507287", "step": 1170, "epoch": 1 }, { "type": "loss", "content": 0.1943138986825943, "timestamp": "2025-09-02 14:14:05.509752", "step": 1171, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:05.574170", "step": 1171, "epoch": 1 }, { "type": "loss", "content": 0.19612300395965576, "timestamp": "2025-09-02 14:14:05.581531", "step": 1172, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:05.638330", "step": 1172, "epoch": 1 }, { "type": "loss", "content": 0.1914314478635788, "timestamp": "2025-09-02 14:14:05.641925", "step": 1173, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:05.698895", "step": 1173, "epoch": 1 }, { "type": "loss", "content": 0.16725850105285645, "timestamp": "2025-09-02 14:14:05.700911", "step": 1174, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:05.758523", "step": 1174, "epoch": 1 }, { "type": "loss", "content": 0.22042731940746307, "timestamp": "2025-09-02 14:14:05.760939", "step": 1175, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:05.815195", "step": 1175, "epoch": 1 }, { "type": "loss", "content": 0.4225582182407379, "timestamp": "2025-09-02 14:14:05.820793", "step": 1176, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:05.875226", "step": 1176, "epoch": 1 }, { "type": "loss", "content": 0.09097928553819656, "timestamp": "2025-09-02 14:14:05.878533", "step": 1177, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:05.932511", "step": 1177, "epoch": 1 }, { "type": "loss", "content": 0.1302945613861084, "timestamp": "2025-09-02 14:14:05.936680", "step": 1178, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:05.990112", "step": 1178, "epoch": 1 }, { "type": "loss", "content": 0.2025027871131897, "timestamp": "2025-09-02 14:14:05.992237", "step": 1179, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:06.046796", "step": 1179, "epoch": 1 }, { "type": "loss", "content": 0.2442786693572998, "timestamp": "2025-09-02 14:14:06.052740", "step": 1180, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:14:07.914790", "step": 1180, "epoch": 1 }, { "type": "pplx", "content": 8554.290156561585, "timestamp": "2025-09-02 14:14:07.919435", "step": 1180, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:07.975034", "step": 1180, "epoch": 1 }, { "type": "loss", "content": 0.401386559009552, "timestamp": "2025-09-02 14:14:07.977374", "step": 1181, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:08.032217", "step": 1181, "epoch": 1 }, { "type": "loss", "content": 0.25277742743492126, "timestamp": "2025-09-02 14:14:08.034508", "step": 1182, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:08.087780", "step": 1182, "epoch": 1 }, { "type": "loss", "content": 0.2643093168735504, "timestamp": "2025-09-02 14:14:08.090031", "step": 1183, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:08.142481", "step": 1183, "epoch": 1 }, { "type": "loss", "content": 0.10128039121627808, "timestamp": "2025-09-02 14:14:08.148490", "step": 1184, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:08.200735", "step": 1184, "epoch": 1 }, { "type": "loss", "content": 0.17952582240104675, "timestamp": "2025-09-02 14:14:08.203045", "step": 1185, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:08.256318", "step": 1185, "epoch": 1 }, { "type": "loss", "content": 0.12137694656848907, "timestamp": "2025-09-02 14:14:08.258699", "step": 1186, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:08.311860", "step": 1186, "epoch": 1 }, { "type": "loss", "content": 0.12925201654434204, "timestamp": "2025-09-02 14:14:08.314329", "step": 1187, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:08.367055", "step": 1187, "epoch": 1 }, { "type": "loss", "content": 0.23439918458461761, "timestamp": "2025-09-02 14:14:08.373309", "step": 1188, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:08.428258", "step": 1188, "epoch": 1 }, { "type": "loss", "content": 0.15372543036937714, "timestamp": "2025-09-02 14:14:08.430423", "step": 1189, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:08.483732", "step": 1189, "epoch": 1 }, { "type": "loss", "content": 0.2876153290271759, "timestamp": "2025-09-02 14:14:08.485827", "step": 1190, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:08.539249", "step": 1190, "epoch": 1 }, { "type": "loss", "content": 0.15462076663970947, "timestamp": "2025-09-02 14:14:08.541509", "step": 1191, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:08.594261", "step": 1191, "epoch": 1 }, { "type": "loss", "content": 0.23681513965129852, "timestamp": "2025-09-02 14:14:08.600475", "step": 1192, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:08.652660", "step": 1192, "epoch": 1 }, { "type": "loss", "content": 0.2153930962085724, "timestamp": "2025-09-02 14:14:08.654817", "step": 1193, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:08.707819", "step": 1193, "epoch": 1 }, { "type": "loss", "content": 0.18096938729286194, "timestamp": "2025-09-02 14:14:08.710042", "step": 1194, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:08.764798", "step": 1194, "epoch": 1 }, { "type": "loss", "content": 0.17033451795578003, "timestamp": "2025-09-02 14:14:08.767278", "step": 1195, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:08.819801", "step": 1195, "epoch": 1 }, { "type": "loss", "content": 0.2088293731212616, "timestamp": "2025-09-02 14:14:08.825916", "step": 1196, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:08.878494", "step": 1196, "epoch": 1 }, { "type": "loss", "content": 0.1450466811656952, "timestamp": "2025-09-02 14:14:08.880729", "step": 1197, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:08.933293", "step": 1197, "epoch": 1 }, { "type": "loss", "content": 0.05922660231590271, "timestamp": "2025-09-02 14:14:08.935785", "step": 1198, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:08.988632", "step": 1198, "epoch": 1 }, { "type": "loss", "content": 0.15073707699775696, "timestamp": "2025-09-02 14:14:08.991000", "step": 1199, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:09.044294", "step": 1199, "epoch": 1 }, { "type": "loss", "content": 0.4279422163963318, "timestamp": "2025-09-02 14:14:09.050424", "step": 1200, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:14:10.913894", "step": 1200, "epoch": 1 }, { "type": "pplx", "content": 8206.325998173606, "timestamp": "2025-09-02 14:14:10.916072", "step": 1200, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1200", "timestamp": "2025-09-02 14:14:11.273753", "step": 1200, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:11.326925", "step": 1200, "epoch": 1 }, { "type": "loss", "content": 0.1418149173259735, "timestamp": "2025-09-02 14:14:11.329336", "step": 1201, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:11.383274", "step": 1201, "epoch": 1 }, { "type": "loss", "content": 0.18102554976940155, "timestamp": "2025-09-02 14:14:11.385558", "step": 1202, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:11.438590", "step": 1202, "epoch": 1 }, { "type": "loss", "content": 0.12972970306873322, "timestamp": "2025-09-02 14:14:11.440990", "step": 1203, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:11.493610", "step": 1203, "epoch": 1 }, { "type": "loss", "content": 0.14274586737155914, "timestamp": "2025-09-02 14:14:11.500886", "step": 1204, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:11.553177", "step": 1204, "epoch": 1 }, { "type": "loss", "content": 0.34080010652542114, "timestamp": "2025-09-02 14:14:11.555195", "step": 1205, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:11.608251", "step": 1205, "epoch": 1 }, { "type": "loss", "content": 0.16504894196987152, "timestamp": "2025-09-02 14:14:11.610533", "step": 1206, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:11.663537", "step": 1206, "epoch": 1 }, { "type": "loss", "content": 0.22156301140785217, "timestamp": "2025-09-02 14:14:11.665819", "step": 1207, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:11.718822", "step": 1207, "epoch": 1 }, { "type": "loss", "content": 0.28300073742866516, "timestamp": "2025-09-02 14:14:11.724643", "step": 1208, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:11.777047", "step": 1208, "epoch": 1 }, { "type": "loss", "content": 0.19305896759033203, "timestamp": "2025-09-02 14:14:11.779828", "step": 1209, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:11.832832", "step": 1209, "epoch": 1 }, { "type": "loss", "content": 0.14319440722465515, "timestamp": "2025-09-02 14:14:11.835427", "step": 1210, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:11.887692", "step": 1210, "epoch": 1 }, { "type": "loss", "content": 0.2348383367061615, "timestamp": "2025-09-02 14:14:11.890034", "step": 1211, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:11.942783", "step": 1211, "epoch": 1 }, { "type": "loss", "content": 0.10170012712478638, "timestamp": "2025-09-02 14:14:11.948445", "step": 1212, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:12.001362", "step": 1212, "epoch": 1 }, { "type": "loss", "content": 0.12675535678863525, "timestamp": "2025-09-02 14:14:12.003701", "step": 1213, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:12.057145", "step": 1213, "epoch": 1 }, { "type": "loss", "content": 0.35492652654647827, "timestamp": "2025-09-02 14:14:12.059716", "step": 1214, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:12.113569", "step": 1214, "epoch": 1 }, { "type": "loss", "content": 0.20785874128341675, "timestamp": "2025-09-02 14:14:12.115746", "step": 1215, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:12.170459", "step": 1215, "epoch": 1 }, { "type": "loss", "content": 0.19548150897026062, "timestamp": "2025-09-02 14:14:12.176442", "step": 1216, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:12.228678", "step": 1216, "epoch": 1 }, { "type": "loss", "content": 0.19592846930027008, "timestamp": "2025-09-02 14:14:12.231232", "step": 1217, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:12.284320", "step": 1217, "epoch": 1 }, { "type": "loss", "content": 0.20596441626548767, "timestamp": "2025-09-02 14:14:12.287109", "step": 1218, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:12.339637", "step": 1218, "epoch": 1 }, { "type": "loss", "content": 0.3359159231185913, "timestamp": "2025-09-02 14:14:12.342050", "step": 1219, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:12.394797", "step": 1219, "epoch": 1 }, { "type": "loss", "content": 0.28477221727371216, "timestamp": "2025-09-02 14:14:12.400545", "step": 1220, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:14:14.271161", "step": 1220, "epoch": 1 }, { "type": "pplx", "content": 7935.596281781054, "timestamp": "2025-09-02 14:14:14.273265", "step": 1220, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:14.325948", "step": 1220, "epoch": 1 }, { "type": "loss", "content": 0.21055620908737183, "timestamp": "2025-09-02 14:14:14.328380", "step": 1221, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:14.381487", "step": 1221, "epoch": 1 }, { "type": "loss", "content": 0.1588701605796814, "timestamp": "2025-09-02 14:14:14.383568", "step": 1222, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:14.437664", "step": 1222, "epoch": 1 }, { "type": "loss", "content": 0.1637565791606903, "timestamp": "2025-09-02 14:14:14.439925", "step": 1223, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:14.492722", "step": 1223, "epoch": 1 }, { "type": "loss", "content": 0.2613271474838257, "timestamp": "2025-09-02 14:14:14.498725", "step": 1224, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:14.553604", "step": 1224, "epoch": 1 }, { "type": "loss", "content": 0.12546256184577942, "timestamp": "2025-09-02 14:14:14.555730", "step": 1225, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:14.609076", "step": 1225, "epoch": 1 }, { "type": "loss", "content": 0.24357591569423676, "timestamp": "2025-09-02 14:14:14.611510", "step": 1226, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:14.666487", "step": 1226, "epoch": 1 }, { "type": "loss", "content": 0.08241570740938187, "timestamp": "2025-09-02 14:14:14.668721", "step": 1227, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:14.721857", "step": 1227, "epoch": 1 }, { "type": "loss", "content": 0.2688785791397095, "timestamp": "2025-09-02 14:14:14.727646", "step": 1228, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:14.779791", "step": 1228, "epoch": 1 }, { "type": "loss", "content": 0.20391713082790375, "timestamp": "2025-09-02 14:14:14.782035", "step": 1229, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:14.835192", "step": 1229, "epoch": 1 }, { "type": "loss", "content": 0.12013918906450272, "timestamp": "2025-09-02 14:14:14.837398", "step": 1230, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:14.890325", "step": 1230, "epoch": 1 }, { "type": "loss", "content": 0.16450339555740356, "timestamp": "2025-09-02 14:14:14.893804", "step": 1231, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:14.946757", "step": 1231, "epoch": 1 }, { "type": "loss", "content": 0.09386026859283447, "timestamp": "2025-09-02 14:14:14.952972", "step": 1232, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:15.005715", "step": 1232, "epoch": 1 }, { "type": "loss", "content": 0.18070338666439056, "timestamp": "2025-09-02 14:14:15.007889", "step": 1233, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:15.061397", "step": 1233, "epoch": 1 }, { "type": "loss", "content": 0.20766712725162506, "timestamp": "2025-09-02 14:14:15.063714", "step": 1234, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:15.116701", "step": 1234, "epoch": 1 }, { "type": "loss", "content": 0.10697361826896667, "timestamp": "2025-09-02 14:14:15.121421", "step": 1235, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:15.175105", "step": 1235, "epoch": 1 }, { "type": "loss", "content": 0.14173458516597748, "timestamp": "2025-09-02 14:14:15.181167", "step": 1236, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:15.233815", "step": 1236, "epoch": 1 }, { "type": "loss", "content": 0.15189585089683533, "timestamp": "2025-09-02 14:14:15.235955", "step": 1237, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:15.288857", "step": 1237, "epoch": 1 }, { "type": "loss", "content": 0.14462704956531525, "timestamp": "2025-09-02 14:14:15.291185", "step": 1238, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:15.345168", "step": 1238, "epoch": 1 }, { "type": "loss", "content": 0.24822859466075897, "timestamp": "2025-09-02 14:14:15.347716", "step": 1239, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:15.400727", "step": 1239, "epoch": 1 }, { "type": "loss", "content": 0.21943506598472595, "timestamp": "2025-09-02 14:14:15.406821", "step": 1240, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:14:17.308226", "step": 1240, "epoch": 1 }, { "type": "pplx", "content": 7708.298747955183, "timestamp": "2025-09-02 14:14:17.310262", "step": 1240, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1240", "timestamp": "2025-09-02 14:14:17.704052", "step": 1240, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:17.760097", "step": 1240, "epoch": 1 }, { "type": "loss", "content": 0.2539196014404297, "timestamp": "2025-09-02 14:14:17.762273", "step": 1241, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:17.816671", "step": 1241, "epoch": 1 }, { "type": "loss", "content": 0.2425844818353653, "timestamp": "2025-09-02 14:14:17.818836", "step": 1242, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:17.872037", "step": 1242, "epoch": 1 }, { "type": "loss", "content": 0.08658114075660706, "timestamp": "2025-09-02 14:14:17.874444", "step": 1243, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:17.927763", "step": 1243, "epoch": 1 }, { "type": "loss", "content": 0.21430689096450806, "timestamp": "2025-09-02 14:14:17.936000", "step": 1244, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:17.989486", "step": 1244, "epoch": 1 }, { "type": "loss", "content": 0.24322041869163513, "timestamp": "2025-09-02 14:14:17.991689", "step": 1245, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:18.044796", "step": 1245, "epoch": 1 }, { "type": "loss", "content": 0.2037028819322586, "timestamp": "2025-09-02 14:14:18.047004", "step": 1246, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:18.100744", "step": 1246, "epoch": 1 }, { "type": "loss", "content": 0.22830410301685333, "timestamp": "2025-09-02 14:14:18.103205", "step": 1247, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:18.157662", "step": 1247, "epoch": 1 }, { "type": "loss", "content": 0.14905725419521332, "timestamp": "2025-09-02 14:14:18.164801", "step": 1248, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:18.217754", "step": 1248, "epoch": 1 }, { "type": "loss", "content": 0.16696220636367798, "timestamp": "2025-09-02 14:14:18.219902", "step": 1249, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:18.273835", "step": 1249, "epoch": 1 }, { "type": "loss", "content": 0.20929457247257233, "timestamp": "2025-09-02 14:14:18.276141", "step": 1250, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:18.329136", "step": 1250, "epoch": 1 }, { "type": "loss", "content": 0.2007312923669815, "timestamp": "2025-09-02 14:14:18.331686", "step": 1251, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:18.384744", "step": 1251, "epoch": 1 }, { "type": "loss", "content": 0.1480093002319336, "timestamp": "2025-09-02 14:14:18.390300", "step": 1252, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:18.442193", "step": 1252, "epoch": 1 }, { "type": "loss", "content": 0.2349192500114441, "timestamp": "2025-09-02 14:14:18.444417", "step": 1253, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:18.497004", "step": 1253, "epoch": 1 }, { "type": "loss", "content": 0.3215111196041107, "timestamp": "2025-09-02 14:14:18.499197", "step": 1254, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:18.551974", "step": 1254, "epoch": 1 }, { "type": "loss", "content": 0.21636608242988586, "timestamp": "2025-09-02 14:14:18.557312", "step": 1255, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:18.614587", "step": 1255, "epoch": 1 }, { "type": "loss", "content": 0.362751305103302, "timestamp": "2025-09-02 14:14:18.620492", "step": 1256, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:18.674112", "step": 1256, "epoch": 1 }, { "type": "loss", "content": 0.253154456615448, "timestamp": "2025-09-02 14:14:18.676290", "step": 1257, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:18.729059", "step": 1257, "epoch": 1 }, { "type": "loss", "content": 0.17429713904857635, "timestamp": "2025-09-02 14:14:18.731263", "step": 1258, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:18.784092", "step": 1258, "epoch": 1 }, { "type": "loss", "content": 0.2392328828573227, "timestamp": "2025-09-02 14:14:18.786302", "step": 1259, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:18.839303", "step": 1259, "epoch": 1 }, { "type": "loss", "content": 0.2070467621088028, "timestamp": "2025-09-02 14:14:18.846362", "step": 1260, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:14:20.728942", "step": 1260, "epoch": 1 }, { "type": "pplx", "content": 7646.443443150275, "timestamp": "2025-09-02 14:14:20.731033", "step": 1260, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:20.782719", "step": 1260, "epoch": 1 }, { "type": "loss", "content": 0.27366897463798523, "timestamp": "2025-09-02 14:14:20.784940", "step": 1261, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:20.838036", "step": 1261, "epoch": 1 }, { "type": "loss", "content": 0.23465193808078766, "timestamp": "2025-09-02 14:14:20.840143", "step": 1262, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:20.896824", "step": 1262, "epoch": 1 }, { "type": "loss", "content": 0.1078827753663063, "timestamp": "2025-09-02 14:14:20.899023", "step": 1263, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:20.952472", "step": 1263, "epoch": 1 }, { "type": "loss", "content": 0.13949327170848846, "timestamp": "2025-09-02 14:14:20.958570", "step": 1264, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:21.011018", "step": 1264, "epoch": 1 }, { "type": "loss", "content": 0.2596402168273926, "timestamp": "2025-09-02 14:14:21.013271", "step": 1265, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:21.066943", "step": 1265, "epoch": 1 }, { "type": "loss", "content": 0.1250077188014984, "timestamp": "2025-09-02 14:14:21.070570", "step": 1266, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:21.123759", "step": 1266, "epoch": 1 }, { "type": "loss", "content": 0.10547887533903122, "timestamp": "2025-09-02 14:14:21.126167", "step": 1267, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:21.179544", "step": 1267, "epoch": 1 }, { "type": "loss", "content": 0.2196536660194397, "timestamp": "2025-09-02 14:14:21.185698", "step": 1268, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:21.239281", "step": 1268, "epoch": 1 }, { "type": "loss", "content": 0.17626355588436127, "timestamp": "2025-09-02 14:14:21.241688", "step": 1269, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:21.295273", "step": 1269, "epoch": 1 }, { "type": "loss", "content": 0.14312846958637238, "timestamp": "2025-09-02 14:14:21.297446", "step": 1270, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:21.350292", "step": 1270, "epoch": 1 }, { "type": "loss", "content": 0.10231005400419235, "timestamp": "2025-09-02 14:14:21.352435", "step": 1271, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:21.405927", "step": 1271, "epoch": 1 }, { "type": "loss", "content": 0.2080439031124115, "timestamp": "2025-09-02 14:14:21.411930", "step": 1272, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:21.464496", "step": 1272, "epoch": 1 }, { "type": "loss", "content": 0.32065096497535706, "timestamp": "2025-09-02 14:14:21.466611", "step": 1273, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:21.519184", "step": 1273, "epoch": 1 }, { "type": "loss", "content": 0.16446471214294434, "timestamp": "2025-09-02 14:14:21.521359", "step": 1274, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:21.574093", "step": 1274, "epoch": 1 }, { "type": "loss", "content": 0.21845583617687225, "timestamp": "2025-09-02 14:14:21.577438", "step": 1275, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:21.631580", "step": 1275, "epoch": 1 }, { "type": "loss", "content": 0.2322443127632141, "timestamp": "2025-09-02 14:14:21.637599", "step": 1276, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:21.690002", "step": 1276, "epoch": 1 }, { "type": "loss", "content": 0.25114426016807556, "timestamp": "2025-09-02 14:14:21.692213", "step": 1277, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:21.746233", "step": 1277, "epoch": 1 }, { "type": "loss", "content": 0.13827699422836304, "timestamp": "2025-09-02 14:14:21.748353", "step": 1278, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:21.804633", "step": 1278, "epoch": 1 }, { "type": "loss", "content": 0.2678951621055603, "timestamp": "2025-09-02 14:14:21.806986", "step": 1279, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:21.860085", "step": 1279, "epoch": 1 }, { "type": "loss", "content": 0.16601842641830444, "timestamp": "2025-09-02 14:14:21.866037", "step": 1280, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:14:23.750695", "step": 1280, "epoch": 1 }, { "type": "pplx", "content": 7567.17489625581, "timestamp": "2025-09-02 14:14:23.753368", "step": 1280, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 1280", "timestamp": "2025-09-02 14:14:24.113438", "step": 1280, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:24.168163", "step": 1280, "epoch": 1 }, { "type": "loss", "content": 0.1284349113702774, "timestamp": "2025-09-02 14:14:24.170135", "step": 1281, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:24.223191", "step": 1281, "epoch": 1 }, { "type": "loss", "content": 0.1377388834953308, "timestamp": "2025-09-02 14:14:24.225880", "step": 1282, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:24.279362", "step": 1282, "epoch": 1 }, { "type": "loss", "content": 0.18503282964229584, "timestamp": "2025-09-02 14:14:24.282001", "step": 1283, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:24.335888", "step": 1283, "epoch": 1 }, { "type": "loss", "content": 0.24435697495937347, "timestamp": "2025-09-02 14:14:24.342194", "step": 1284, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:24.394443", "step": 1284, "epoch": 1 }, { "type": "loss", "content": 0.1727815866470337, "timestamp": "2025-09-02 14:14:24.396905", "step": 1285, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:24.450165", "step": 1285, "epoch": 1 }, { "type": "loss", "content": 0.0916629284620285, "timestamp": "2025-09-02 14:14:24.452413", "step": 1286, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:24.504905", "step": 1286, "epoch": 1 }, { "type": "loss", "content": 0.21359987556934357, "timestamp": "2025-09-02 14:14:24.507775", "step": 1287, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:24.563073", "step": 1287, "epoch": 1 }, { "type": "loss", "content": 0.22259066998958588, "timestamp": "2025-09-02 14:14:24.572245", "step": 1288, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:24.627512", "step": 1288, "epoch": 1 }, { "type": "loss", "content": 0.20751428604125977, "timestamp": "2025-09-02 14:14:24.629476", "step": 1289, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:24.684591", "step": 1289, "epoch": 1 }, { "type": "loss", "content": 0.1772179901599884, "timestamp": "2025-09-02 14:14:24.686721", "step": 1290, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:24.740780", "step": 1290, "epoch": 1 }, { "type": "loss", "content": 0.16223381459712982, "timestamp": "2025-09-02 14:14:24.742985", "step": 1291, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:24.798160", "step": 1291, "epoch": 1 }, { "type": "loss", "content": 0.1592487245798111, "timestamp": "2025-09-02 14:14:24.803798", "step": 1292, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:24.857757", "step": 1292, "epoch": 1 }, { "type": "loss", "content": 0.19102619588375092, "timestamp": "2025-09-02 14:14:24.859807", "step": 1293, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:24.919050", "step": 1293, "epoch": 1 }, { "type": "loss", "content": 0.12474578619003296, "timestamp": "2025-09-02 14:14:24.921408", "step": 1294, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:24.975062", "step": 1294, "epoch": 1 }, { "type": "loss", "content": 0.18288032710552216, "timestamp": "2025-09-02 14:14:24.977365", "step": 1295, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:25.031138", "step": 1295, "epoch": 1 }, { "type": "loss", "content": 0.3666345179080963, "timestamp": "2025-09-02 14:14:25.037038", "step": 1296, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:25.089070", "step": 1296, "epoch": 1 }, { "type": "loss", "content": 0.14100699126720428, "timestamp": "2025-09-02 14:14:25.091212", "step": 1297, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:25.144650", "step": 1297, "epoch": 1 }, { "type": "loss", "content": 0.1241668164730072, "timestamp": "2025-09-02 14:14:25.147442", "step": 1298, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:25.200837", "step": 1298, "epoch": 1 }, { "type": "loss", "content": 0.225935161113739, "timestamp": "2025-09-02 14:14:25.203092", "step": 1299, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:25.256501", "step": 1299, "epoch": 1 }, { "type": "loss", "content": 0.17786350846290588, "timestamp": "2025-09-02 14:14:25.262556", "step": 1300, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:14:27.190880", "step": 1300, "epoch": 1 }, { "type": "pplx", "content": 7531.533659812799, "timestamp": "2025-09-02 14:14:27.193398", "step": 1300, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:27.246396", "step": 1300, "epoch": 2 }, { "type": "loss", "content": 0.17551372945308685, "timestamp": "2025-09-02 14:14:27.249722", "step": 1301, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:27.304198", "step": 1301, "epoch": 2 }, { "type": "loss", "content": 0.1373559981584549, "timestamp": "2025-09-02 14:14:27.307706", "step": 1302, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:27.364043", "step": 1302, "epoch": 2 }, { "type": "loss", "content": 0.23794730007648468, "timestamp": "2025-09-02 14:14:27.366214", "step": 1303, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:27.418759", "step": 1303, "epoch": 2 }, { "type": "loss", "content": 0.2141202688217163, "timestamp": "2025-09-02 14:14:27.425084", "step": 1304, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:27.486535", "step": 1304, "epoch": 2 }, { "type": "loss", "content": 0.12452469766139984, "timestamp": "2025-09-02 14:14:27.488675", "step": 1305, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:27.541643", "step": 1305, "epoch": 2 }, { "type": "loss", "content": 0.21510975062847137, "timestamp": "2025-09-02 14:14:27.543983", "step": 1306, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:27.597050", "step": 1306, "epoch": 2 }, { "type": "loss", "content": 0.2329224944114685, "timestamp": "2025-09-02 14:14:27.599083", "step": 1307, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:27.651645", "step": 1307, "epoch": 2 }, { "type": "loss", "content": 0.2361801415681839, "timestamp": "2025-09-02 14:14:27.658763", "step": 1308, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:27.710974", "step": 1308, "epoch": 2 }, { "type": "loss", "content": 0.17188814282417297, "timestamp": "2025-09-02 14:14:27.713294", "step": 1309, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:27.766770", "step": 1309, "epoch": 2 }, { "type": "loss", "content": 0.11970847100019455, "timestamp": "2025-09-02 14:14:27.768711", "step": 1310, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:27.823084", "step": 1310, "epoch": 2 }, { "type": "loss", "content": 0.13274654746055603, "timestamp": "2025-09-02 14:14:27.825805", "step": 1311, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:27.879188", "step": 1311, "epoch": 2 }, { "type": "loss", "content": 0.12142153084278107, "timestamp": "2025-09-02 14:14:27.885320", "step": 1312, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:27.937724", "step": 1312, "epoch": 2 }, { "type": "loss", "content": 0.13564428687095642, "timestamp": "2025-09-02 14:14:27.939914", "step": 1313, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:27.992747", "step": 1313, "epoch": 2 }, { "type": "loss", "content": 0.2114870548248291, "timestamp": "2025-09-02 14:14:27.995324", "step": 1314, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:28.048684", "step": 1314, "epoch": 2 }, { "type": "loss", "content": 0.3009539842605591, "timestamp": "2025-09-02 14:14:28.051198", "step": 1315, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:28.105330", "step": 1315, "epoch": 2 }, { "type": "loss", "content": 0.21291516721248627, "timestamp": "2025-09-02 14:14:28.111403", "step": 1316, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:28.164067", "step": 1316, "epoch": 2 }, { "type": "loss", "content": 0.10387202352285385, "timestamp": "2025-09-02 14:14:28.166177", "step": 1317, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:28.221266", "step": 1317, "epoch": 2 }, { "type": "loss", "content": 0.1628091037273407, "timestamp": "2025-09-02 14:14:28.223526", "step": 1318, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:28.278239", "step": 1318, "epoch": 2 }, { "type": "loss", "content": 0.25775715708732605, "timestamp": "2025-09-02 14:14:28.280253", "step": 1319, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:28.334310", "step": 1319, "epoch": 2 }, { "type": "loss", "content": 0.17408624291419983, "timestamp": "2025-09-02 14:14:28.340102", "step": 1320, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:14:30.224401", "step": 1320, "epoch": 2 }, { "type": "pplx", "content": 7460.290098876051, "timestamp": "2025-09-02 14:14:30.226487", "step": 1320, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1320", "timestamp": "2025-09-02 14:14:30.750924", "step": 1320, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:30.806925", "step": 1320, "epoch": 2 }, { "type": "loss", "content": 0.1739158034324646, "timestamp": "2025-09-02 14:14:30.809131", "step": 1321, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:30.863744", "step": 1321, "epoch": 2 }, { "type": "loss", "content": 0.13787204027175903, "timestamp": "2025-09-02 14:14:30.866058", "step": 1322, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:30.919043", "step": 1322, "epoch": 2 }, { "type": "loss", "content": 0.20546895265579224, "timestamp": "2025-09-02 14:14:30.920961", "step": 1323, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:30.974524", "step": 1323, "epoch": 2 }, { "type": "loss", "content": 0.20111192762851715, "timestamp": "2025-09-02 14:14:30.980644", "step": 1324, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:31.033687", "step": 1324, "epoch": 2 }, { "type": "loss", "content": 0.2075800597667694, "timestamp": "2025-09-02 14:14:31.036008", "step": 1325, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:31.088701", "step": 1325, "epoch": 2 }, { "type": "loss", "content": 0.18741843104362488, "timestamp": "2025-09-02 14:14:31.090971", "step": 1326, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:31.144510", "step": 1326, "epoch": 2 }, { "type": "loss", "content": 0.25890088081359863, "timestamp": "2025-09-02 14:14:31.146785", "step": 1327, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:31.199708", "step": 1327, "epoch": 2 }, { "type": "loss", "content": 0.21647875010967255, "timestamp": "2025-09-02 14:14:31.205898", "step": 1328, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:31.258922", "step": 1328, "epoch": 2 }, { "type": "loss", "content": 0.160391703248024, "timestamp": "2025-09-02 14:14:31.261410", "step": 1329, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:31.317526", "step": 1329, "epoch": 2 }, { "type": "loss", "content": 0.2411142736673355, "timestamp": "2025-09-02 14:14:31.319869", "step": 1330, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:31.373613", "step": 1330, "epoch": 2 }, { "type": "loss", "content": 0.1616753339767456, "timestamp": "2025-09-02 14:14:31.376038", "step": 1331, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:31.445500", "step": 1331, "epoch": 2 }, { "type": "loss", "content": 0.13925251364707947, "timestamp": "2025-09-02 14:14:31.451279", "step": 1332, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:31.505624", "step": 1332, "epoch": 2 }, { "type": "loss", "content": 0.13860081136226654, "timestamp": "2025-09-02 14:14:31.507876", "step": 1333, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:31.560549", "step": 1333, "epoch": 2 }, { "type": "loss", "content": 0.2602631151676178, "timestamp": "2025-09-02 14:14:31.563169", "step": 1334, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:31.616244", "step": 1334, "epoch": 2 }, { "type": "loss", "content": 0.29541292786598206, "timestamp": "2025-09-02 14:14:31.617995", "step": 1335, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:31.670923", "step": 1335, "epoch": 2 }, { "type": "loss", "content": 0.22453294694423676, "timestamp": "2025-09-02 14:14:31.676767", "step": 1336, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:31.729064", "step": 1336, "epoch": 2 }, { "type": "loss", "content": 0.1295289844274521, "timestamp": "2025-09-02 14:14:31.731449", "step": 1337, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:31.784880", "step": 1337, "epoch": 2 }, { "type": "loss", "content": 0.24640415608882904, "timestamp": "2025-09-02 14:14:31.787008", "step": 1338, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:31.840855", "step": 1338, "epoch": 2 }, { "type": "loss", "content": 0.25579389929771423, "timestamp": "2025-09-02 14:14:31.843700", "step": 1339, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:31.896844", "step": 1339, "epoch": 2 }, { "type": "loss", "content": 0.07167704403400421, "timestamp": "2025-09-02 14:14:31.902797", "step": 1340, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:14:33.798537", "step": 1340, "epoch": 2 }, { "type": "pplx", "content": 7418.234727920731, "timestamp": "2025-09-02 14:14:33.800860", "step": 1340, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:33.854423", "step": 1340, "epoch": 2 }, { "type": "loss", "content": 0.1810855120420456, "timestamp": "2025-09-02 14:14:33.856928", "step": 1341, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:33.910788", "step": 1341, "epoch": 2 }, { "type": "loss", "content": 0.12535697221755981, "timestamp": "2025-09-02 14:14:33.912729", "step": 1342, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:33.965424", "step": 1342, "epoch": 2 }, { "type": "loss", "content": 0.24519287049770355, "timestamp": "2025-09-02 14:14:33.967761", "step": 1343, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:34.021246", "step": 1343, "epoch": 2 }, { "type": "loss", "content": 0.0914510115981102, "timestamp": "2025-09-02 14:14:34.027282", "step": 1344, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:34.079902", "step": 1344, "epoch": 2 }, { "type": "loss", "content": 0.18005388975143433, "timestamp": "2025-09-02 14:14:34.082905", "step": 1345, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:34.135774", "step": 1345, "epoch": 2 }, { "type": "loss", "content": 0.2733306288719177, "timestamp": "2025-09-02 14:14:34.138329", "step": 1346, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:34.191494", "step": 1346, "epoch": 2 }, { "type": "loss", "content": 0.19697614014148712, "timestamp": "2025-09-02 14:14:34.193899", "step": 1347, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:34.246527", "step": 1347, "epoch": 2 }, { "type": "loss", "content": 0.07799861580133438, "timestamp": "2025-09-02 14:14:34.252447", "step": 1348, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:34.306373", "step": 1348, "epoch": 2 }, { "type": "loss", "content": 0.11111468821763992, "timestamp": "2025-09-02 14:14:34.310923", "step": 1349, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:34.367363", "step": 1349, "epoch": 2 }, { "type": "loss", "content": 0.37715038657188416, "timestamp": "2025-09-02 14:14:34.369455", "step": 1350, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:34.430186", "step": 1350, "epoch": 2 }, { "type": "loss", "content": 0.23658736050128937, "timestamp": "2025-09-02 14:14:34.432349", "step": 1351, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:34.485609", "step": 1351, "epoch": 2 }, { "type": "loss", "content": 0.19703595340251923, "timestamp": "2025-09-02 14:14:34.491701", "step": 1352, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:34.544335", "step": 1352, "epoch": 2 }, { "type": "loss", "content": 0.3663310110569, "timestamp": "2025-09-02 14:14:34.548192", "step": 1353, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:34.606106", "step": 1353, "epoch": 2 }, { "type": "loss", "content": 0.22059214115142822, "timestamp": "2025-09-02 14:14:34.608601", "step": 1354, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:34.663611", "step": 1354, "epoch": 2 }, { "type": "loss", "content": 0.17066656053066254, "timestamp": "2025-09-02 14:14:34.665889", "step": 1355, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:34.723021", "step": 1355, "epoch": 2 }, { "type": "loss", "content": 0.14390265941619873, "timestamp": "2025-09-02 14:14:34.729326", "step": 1356, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:34.788770", "step": 1356, "epoch": 2 }, { "type": "loss", "content": 0.33753806352615356, "timestamp": "2025-09-02 14:14:34.790981", "step": 1357, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:34.843904", "step": 1357, "epoch": 2 }, { "type": "loss", "content": 0.14723411202430725, "timestamp": "2025-09-02 14:14:34.846439", "step": 1358, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:34.900425", "step": 1358, "epoch": 2 }, { "type": "loss", "content": 0.24935570359230042, "timestamp": "2025-09-02 14:14:34.902598", "step": 1359, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:34.955648", "step": 1359, "epoch": 2 }, { "type": "loss", "content": 0.19726720452308655, "timestamp": "2025-09-02 14:14:34.961642", "step": 1360, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:14:36.852006", "step": 1360, "epoch": 2 }, { "type": "pplx", "content": 7201.636243999576, "timestamp": "2025-09-02 14:14:36.854167", "step": 1360, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1360", "timestamp": "2025-09-02 14:14:37.212467", "step": 1360, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:37.268526", "step": 1360, "epoch": 2 }, { "type": "loss", "content": 0.03516945615410805, "timestamp": "2025-09-02 14:14:37.270684", "step": 1361, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:37.325258", "step": 1361, "epoch": 2 }, { "type": "loss", "content": 0.29295191168785095, "timestamp": "2025-09-02 14:14:37.327825", "step": 1362, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:37.382028", "step": 1362, "epoch": 2 }, { "type": "loss", "content": 0.11440539360046387, "timestamp": "2025-09-02 14:14:37.384355", "step": 1363, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:37.437460", "step": 1363, "epoch": 2 }, { "type": "loss", "content": 0.12829262018203735, "timestamp": "2025-09-02 14:14:37.443580", "step": 1364, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:37.495940", "step": 1364, "epoch": 2 }, { "type": "loss", "content": 0.17149397730827332, "timestamp": "2025-09-02 14:14:37.497934", "step": 1365, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:37.550862", "step": 1365, "epoch": 2 }, { "type": "loss", "content": 0.30899283289909363, "timestamp": "2025-09-02 14:14:37.553114", "step": 1366, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:37.606227", "step": 1366, "epoch": 2 }, { "type": "loss", "content": 0.19153793156147003, "timestamp": "2025-09-02 14:14:37.608366", "step": 1367, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:37.661414", "step": 1367, "epoch": 2 }, { "type": "loss", "content": 0.10122761130332947, "timestamp": "2025-09-02 14:14:37.667546", "step": 1368, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:37.719752", "step": 1368, "epoch": 2 }, { "type": "loss", "content": 0.2087310552597046, "timestamp": "2025-09-02 14:14:37.721947", "step": 1369, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:37.774699", "step": 1369, "epoch": 2 }, { "type": "loss", "content": 0.13941149413585663, "timestamp": "2025-09-02 14:14:37.776962", "step": 1370, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:37.829760", "step": 1370, "epoch": 2 }, { "type": "loss", "content": 0.2824164628982544, "timestamp": "2025-09-02 14:14:37.831985", "step": 1371, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:37.887085", "step": 1371, "epoch": 2 }, { "type": "loss", "content": 0.16944414377212524, "timestamp": "2025-09-02 14:14:37.893066", "step": 1372, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:37.945500", "step": 1372, "epoch": 2 }, { "type": "loss", "content": 0.21587979793548584, "timestamp": "2025-09-02 14:14:37.947726", "step": 1373, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:38.002183", "step": 1373, "epoch": 2 }, { "type": "loss", "content": 0.10153981298208237, "timestamp": "2025-09-02 14:14:38.004277", "step": 1374, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:38.057401", "step": 1374, "epoch": 2 }, { "type": "loss", "content": 0.13731293380260468, "timestamp": "2025-09-02 14:14:38.059598", "step": 1375, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:38.112813", "step": 1375, "epoch": 2 }, { "type": "loss", "content": 0.12170983850955963, "timestamp": "2025-09-02 14:14:38.118758", "step": 1376, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:38.171273", "step": 1376, "epoch": 2 }, { "type": "loss", "content": 0.18269094824790955, "timestamp": "2025-09-02 14:14:38.173617", "step": 1377, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:38.226539", "step": 1377, "epoch": 2 }, { "type": "loss", "content": 0.29655691981315613, "timestamp": "2025-09-02 14:14:38.228859", "step": 1378, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:38.281842", "step": 1378, "epoch": 2 }, { "type": "loss", "content": 0.3428836166858673, "timestamp": "2025-09-02 14:14:38.284138", "step": 1379, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:38.337079", "step": 1379, "epoch": 2 }, { "type": "loss", "content": 0.20560967922210693, "timestamp": "2025-09-02 14:14:38.343041", "step": 1380, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:14:40.224348", "step": 1380, "epoch": 2 }, { "type": "pplx", "content": 7051.019466307469, "timestamp": "2025-09-02 14:14:40.226980", "step": 1380, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:40.280779", "step": 1380, "epoch": 2 }, { "type": "loss", "content": 0.1909906566143036, "timestamp": "2025-09-02 14:14:40.283139", "step": 1381, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:40.337278", "step": 1381, "epoch": 2 }, { "type": "loss", "content": 0.1757160723209381, "timestamp": "2025-09-02 14:14:40.339572", "step": 1382, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:40.393072", "step": 1382, "epoch": 2 }, { "type": "loss", "content": 0.10302970558404922, "timestamp": "2025-09-02 14:14:40.395041", "step": 1383, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:40.447218", "step": 1383, "epoch": 2 }, { "type": "loss", "content": 0.2825024127960205, "timestamp": "2025-09-02 14:14:40.453219", "step": 1384, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:40.505255", "step": 1384, "epoch": 2 }, { "type": "loss", "content": 0.15318083763122559, "timestamp": "2025-09-02 14:14:40.507497", "step": 1385, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:40.560968", "step": 1385, "epoch": 2 }, { "type": "loss", "content": 0.2102709859609604, "timestamp": "2025-09-02 14:14:40.563214", "step": 1386, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:40.616650", "step": 1386, "epoch": 2 }, { "type": "loss", "content": 0.200689435005188, "timestamp": "2025-09-02 14:14:40.619699", "step": 1387, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:40.672856", "step": 1387, "epoch": 2 }, { "type": "loss", "content": 0.23803895711898804, "timestamp": "2025-09-02 14:14:40.678711", "step": 1388, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:40.731307", "step": 1388, "epoch": 2 }, { "type": "loss", "content": 0.18716761469841003, "timestamp": "2025-09-02 14:14:40.733728", "step": 1389, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:40.788472", "step": 1389, "epoch": 2 }, { "type": "loss", "content": 0.18031685054302216, "timestamp": "2025-09-02 14:14:40.790547", "step": 1390, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:40.843863", "step": 1390, "epoch": 2 }, { "type": "loss", "content": 0.17205938696861267, "timestamp": "2025-09-02 14:14:40.846366", "step": 1391, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:40.900233", "step": 1391, "epoch": 2 }, { "type": "loss", "content": 0.16882839798927307, "timestamp": "2025-09-02 14:14:40.906123", "step": 1392, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:40.957912", "step": 1392, "epoch": 2 }, { "type": "loss", "content": 0.0816575363278389, "timestamp": "2025-09-02 14:14:40.960317", "step": 1393, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:41.012686", "step": 1393, "epoch": 2 }, { "type": "loss", "content": 0.18584515154361725, "timestamp": "2025-09-02 14:14:41.015002", "step": 1394, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:41.068635", "step": 1394, "epoch": 2 }, { "type": "loss", "content": 0.16501811146736145, "timestamp": "2025-09-02 14:14:41.070826", "step": 1395, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:41.123790", "step": 1395, "epoch": 2 }, { "type": "loss", "content": 0.13797646760940552, "timestamp": "2025-09-02 14:14:41.129774", "step": 1396, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:41.182395", "step": 1396, "epoch": 2 }, { "type": "loss", "content": 0.19324195384979248, "timestamp": "2025-09-02 14:14:41.185264", "step": 1397, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:41.240680", "step": 1397, "epoch": 2 }, { "type": "loss", "content": 0.1406335085630417, "timestamp": "2025-09-02 14:14:41.242918", "step": 1398, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:41.299595", "step": 1398, "epoch": 2 }, { "type": "loss", "content": 0.1929086446762085, "timestamp": "2025-09-02 14:14:41.302029", "step": 1399, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:41.359891", "step": 1399, "epoch": 2 }, { "type": "loss", "content": 0.18321073055267334, "timestamp": "2025-09-02 14:14:41.365804", "step": 1400, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:14:43.279305", "step": 1400, "epoch": 2 }, { "type": "pplx", "content": 6950.62268446676, "timestamp": "2025-09-02 14:14:43.281328", "step": 1400, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1400", "timestamp": "2025-09-02 14:14:43.648852", "step": 1400, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:43.709177", "step": 1400, "epoch": 2 }, { "type": "loss", "content": 0.18546761572360992, "timestamp": "2025-09-02 14:14:43.711510", "step": 1401, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:43.767400", "step": 1401, "epoch": 2 }, { "type": "loss", "content": 0.1371525079011917, "timestamp": "2025-09-02 14:14:43.769611", "step": 1402, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:43.828850", "step": 1402, "epoch": 2 }, { "type": "loss", "content": 0.12153467535972595, "timestamp": "2025-09-02 14:14:43.831214", "step": 1403, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:43.884116", "step": 1403, "epoch": 2 }, { "type": "loss", "content": 0.08500020205974579, "timestamp": "2025-09-02 14:14:43.890203", "step": 1404, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:43.946873", "step": 1404, "epoch": 2 }, { "type": "loss", "content": 0.3191583454608917, "timestamp": "2025-09-02 14:14:43.949039", "step": 1405, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:44.004037", "step": 1405, "epoch": 2 }, { "type": "loss", "content": 0.14298072457313538, "timestamp": "2025-09-02 14:14:44.006222", "step": 1406, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:44.058962", "step": 1406, "epoch": 2 }, { "type": "loss", "content": 0.30813539028167725, "timestamp": "2025-09-02 14:14:44.062120", "step": 1407, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:44.120632", "step": 1407, "epoch": 2 }, { "type": "loss", "content": 0.09666888415813446, "timestamp": "2025-09-02 14:14:44.126820", "step": 1408, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:44.182110", "step": 1408, "epoch": 2 }, { "type": "loss", "content": 0.13264644145965576, "timestamp": "2025-09-02 14:14:44.184604", "step": 1409, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:44.237664", "step": 1409, "epoch": 2 }, { "type": "loss", "content": 0.19713634252548218, "timestamp": "2025-09-02 14:14:44.240189", "step": 1410, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:44.293761", "step": 1410, "epoch": 2 }, { "type": "loss", "content": 0.25039568543434143, "timestamp": "2025-09-02 14:14:44.295717", "step": 1411, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:44.348642", "step": 1411, "epoch": 2 }, { "type": "loss", "content": 0.19423042237758636, "timestamp": "2025-09-02 14:14:44.355071", "step": 1412, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:44.407733", "step": 1412, "epoch": 2 }, { "type": "loss", "content": 0.23839014768600464, "timestamp": "2025-09-02 14:14:44.410024", "step": 1413, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:44.463621", "step": 1413, "epoch": 2 }, { "type": "loss", "content": 0.22086109220981598, "timestamp": "2025-09-02 14:14:44.465982", "step": 1414, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:44.519871", "step": 1414, "epoch": 2 }, { "type": "loss", "content": 0.20450030267238617, "timestamp": "2025-09-02 14:14:44.522048", "step": 1415, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:44.576077", "step": 1415, "epoch": 2 }, { "type": "loss", "content": 0.20490166544914246, "timestamp": "2025-09-02 14:14:44.582502", "step": 1416, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:44.636198", "step": 1416, "epoch": 2 }, { "type": "loss", "content": 0.243586003780365, "timestamp": "2025-09-02 14:14:44.638481", "step": 1417, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:44.692141", "step": 1417, "epoch": 2 }, { "type": "loss", "content": 0.11942626535892487, "timestamp": "2025-09-02 14:14:44.694341", "step": 1418, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:44.747772", "step": 1418, "epoch": 2 }, { "type": "loss", "content": 0.24939124286174774, "timestamp": "2025-09-02 14:14:44.750665", "step": 1419, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:44.803674", "step": 1419, "epoch": 2 }, { "type": "loss", "content": 0.14054431021213531, "timestamp": "2025-09-02 14:14:44.809658", "step": 1420, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:14:46.690520", "step": 1420, "epoch": 2 }, { "type": "pplx", "content": 6981.136920904981, "timestamp": "2025-09-02 14:14:46.692646", "step": 1420, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:46.746148", "step": 1420, "epoch": 2 }, { "type": "loss", "content": 0.1280570775270462, "timestamp": "2025-09-02 14:14:46.748420", "step": 1421, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:46.801661", "step": 1421, "epoch": 2 }, { "type": "loss", "content": 0.29478156566619873, "timestamp": "2025-09-02 14:14:46.804988", "step": 1422, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:46.857654", "step": 1422, "epoch": 2 }, { "type": "loss", "content": 0.11176256090402603, "timestamp": "2025-09-02 14:14:46.860421", "step": 1423, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:46.913512", "step": 1423, "epoch": 2 }, { "type": "loss", "content": 0.28933340311050415, "timestamp": "2025-09-02 14:14:46.919826", "step": 1424, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:46.972404", "step": 1424, "epoch": 2 }, { "type": "loss", "content": 0.22844192385673523, "timestamp": "2025-09-02 14:14:46.975109", "step": 1425, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:47.028467", "step": 1425, "epoch": 2 }, { "type": "loss", "content": 0.16201670467853546, "timestamp": "2025-09-02 14:14:47.031158", "step": 1426, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:47.084808", "step": 1426, "epoch": 2 }, { "type": "loss", "content": 0.22300703823566437, "timestamp": "2025-09-02 14:14:47.087465", "step": 1427, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:47.141028", "step": 1427, "epoch": 2 }, { "type": "loss", "content": 0.11879327148199081, "timestamp": "2025-09-02 14:14:47.146974", "step": 1428, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:47.199633", "step": 1428, "epoch": 2 }, { "type": "loss", "content": 0.26346924901008606, "timestamp": "2025-09-02 14:14:47.203138", "step": 1429, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:47.256316", "step": 1429, "epoch": 2 }, { "type": "loss", "content": 0.2070111632347107, "timestamp": "2025-09-02 14:14:47.258712", "step": 1430, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:47.312042", "step": 1430, "epoch": 2 }, { "type": "loss", "content": 0.1286098212003708, "timestamp": "2025-09-02 14:14:47.314773", "step": 1431, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:47.368247", "step": 1431, "epoch": 2 }, { "type": "loss", "content": 0.12170419096946716, "timestamp": "2025-09-02 14:14:47.374605", "step": 1432, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:47.427111", "step": 1432, "epoch": 2 }, { "type": "loss", "content": 0.23191550374031067, "timestamp": "2025-09-02 14:14:47.429477", "step": 1433, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:47.483110", "step": 1433, "epoch": 2 }, { "type": "loss", "content": 0.17930705845355988, "timestamp": "2025-09-02 14:14:47.485463", "step": 1434, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:47.538900", "step": 1434, "epoch": 2 }, { "type": "loss", "content": 0.1733967661857605, "timestamp": "2025-09-02 14:14:47.541855", "step": 1435, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:47.595696", "step": 1435, "epoch": 2 }, { "type": "loss", "content": 0.18091271817684174, "timestamp": "2025-09-02 14:14:47.601589", "step": 1436, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:47.654137", "step": 1436, "epoch": 2 }, { "type": "loss", "content": 0.21977733075618744, "timestamp": "2025-09-02 14:14:47.656534", "step": 1437, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:47.709776", "step": 1437, "epoch": 2 }, { "type": "loss", "content": 0.13368260860443115, "timestamp": "2025-09-02 14:14:47.712264", "step": 1438, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:47.765988", "step": 1438, "epoch": 2 }, { "type": "loss", "content": 0.2912275493144989, "timestamp": "2025-09-02 14:14:47.768355", "step": 1439, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:47.821738", "step": 1439, "epoch": 2 }, { "type": "loss", "content": 0.13125886023044586, "timestamp": "2025-09-02 14:14:47.827991", "step": 1440, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:14:49.702431", "step": 1440, "epoch": 2 }, { "type": "pplx", "content": 7104.954970894749, "timestamp": "2025-09-02 14:14:49.704486", "step": 1440, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1440", "timestamp": "2025-09-02 14:14:50.062175", "step": 1440, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:50.119407", "step": 1440, "epoch": 2 }, { "type": "loss", "content": 0.20343191921710968, "timestamp": "2025-09-02 14:14:50.121608", "step": 1441, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:50.176670", "step": 1441, "epoch": 2 }, { "type": "loss", "content": 0.16102007031440735, "timestamp": "2025-09-02 14:14:50.178943", "step": 1442, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:50.232280", "step": 1442, "epoch": 2 }, { "type": "loss", "content": 0.21684084832668304, "timestamp": "2025-09-02 14:14:50.234828", "step": 1443, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:50.287872", "step": 1443, "epoch": 2 }, { "type": "loss", "content": 0.19188861548900604, "timestamp": "2025-09-02 14:14:50.293996", "step": 1444, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:50.345926", "step": 1444, "epoch": 2 }, { "type": "loss", "content": 0.2105085849761963, "timestamp": "2025-09-02 14:14:50.348152", "step": 1445, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:50.401084", "step": 1445, "epoch": 2 }, { "type": "loss", "content": 0.19422626495361328, "timestamp": "2025-09-02 14:14:50.403332", "step": 1446, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:50.456135", "step": 1446, "epoch": 2 }, { "type": "loss", "content": 0.18853774666786194, "timestamp": "2025-09-02 14:14:50.458282", "step": 1447, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:50.513404", "step": 1447, "epoch": 2 }, { "type": "loss", "content": 0.30969542264938354, "timestamp": "2025-09-02 14:14:50.519818", "step": 1448, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:50.572677", "step": 1448, "epoch": 2 }, { "type": "loss", "content": 0.18706829845905304, "timestamp": "2025-09-02 14:14:50.574932", "step": 1449, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:50.628101", "step": 1449, "epoch": 2 }, { "type": "loss", "content": 0.061881911009550095, "timestamp": "2025-09-02 14:14:50.630524", "step": 1450, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:50.684635", "step": 1450, "epoch": 2 }, { "type": "loss", "content": 0.11770352721214294, "timestamp": "2025-09-02 14:14:50.692718", "step": 1451, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:50.748262", "step": 1451, "epoch": 2 }, { "type": "loss", "content": 0.16158336400985718, "timestamp": "2025-09-02 14:14:50.754629", "step": 1452, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:50.809467", "step": 1452, "epoch": 2 }, { "type": "loss", "content": 0.2956713140010834, "timestamp": "2025-09-02 14:14:50.812342", "step": 1453, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:50.870299", "step": 1453, "epoch": 2 }, { "type": "loss", "content": 0.2542232275009155, "timestamp": "2025-09-02 14:14:50.872425", "step": 1454, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:50.928886", "step": 1454, "epoch": 2 }, { "type": "loss", "content": 0.1933789700269699, "timestamp": "2025-09-02 14:14:50.931249", "step": 1455, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:50.986331", "step": 1455, "epoch": 2 }, { "type": "loss", "content": 0.10468480736017227, "timestamp": "2025-09-02 14:14:50.993121", "step": 1456, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:51.047853", "step": 1456, "epoch": 2 }, { "type": "loss", "content": 0.2542237639427185, "timestamp": "2025-09-02 14:14:51.051363", "step": 1457, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:51.106495", "step": 1457, "epoch": 2 }, { "type": "loss", "content": 0.16588811576366425, "timestamp": "2025-09-02 14:14:51.108725", "step": 1458, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:51.164203", "step": 1458, "epoch": 2 }, { "type": "loss", "content": 0.11080075800418854, "timestamp": "2025-09-02 14:14:51.166962", "step": 1459, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:51.224745", "step": 1459, "epoch": 2 }, { "type": "loss", "content": 0.14683979749679565, "timestamp": "2025-09-02 14:14:51.231430", "step": 1460, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:14:53.118897", "step": 1460, "epoch": 2 }, { "type": "pplx", "content": 7225.694470739731, "timestamp": "2025-09-02 14:14:53.121035", "step": 1460, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:53.174875", "step": 1460, "epoch": 2 }, { "type": "loss", "content": 0.1770847588777542, "timestamp": "2025-09-02 14:14:53.177800", "step": 1461, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:53.231729", "step": 1461, "epoch": 2 }, { "type": "loss", "content": 0.10822723805904388, "timestamp": "2025-09-02 14:14:53.233923", "step": 1462, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:53.287921", "step": 1462, "epoch": 2 }, { "type": "loss", "content": 0.2046375423669815, "timestamp": "2025-09-02 14:14:53.289816", "step": 1463, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:53.343274", "step": 1463, "epoch": 2 }, { "type": "loss", "content": 0.19966119527816772, "timestamp": "2025-09-02 14:14:53.349149", "step": 1464, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:53.403852", "step": 1464, "epoch": 2 }, { "type": "loss", "content": 0.11477626860141754, "timestamp": "2025-09-02 14:14:53.407479", "step": 1465, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:53.463301", "step": 1465, "epoch": 2 }, { "type": "loss", "content": 0.09784653037786484, "timestamp": "2025-09-02 14:14:53.465369", "step": 1466, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:53.518673", "step": 1466, "epoch": 2 }, { "type": "loss", "content": 0.1455068290233612, "timestamp": "2025-09-02 14:14:53.520720", "step": 1467, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:53.573938", "step": 1467, "epoch": 2 }, { "type": "loss", "content": 0.1972431093454361, "timestamp": "2025-09-02 14:14:53.580012", "step": 1468, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:53.634329", "step": 1468, "epoch": 2 }, { "type": "loss", "content": 0.1581440269947052, "timestamp": "2025-09-02 14:14:53.638737", "step": 1469, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:53.691995", "step": 1469, "epoch": 2 }, { "type": "loss", "content": 0.19445009529590607, "timestamp": "2025-09-02 14:14:53.694493", "step": 1470, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:53.748035", "step": 1470, "epoch": 2 }, { "type": "loss", "content": 0.1039266511797905, "timestamp": "2025-09-02 14:14:53.750188", "step": 1471, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:53.804180", "step": 1471, "epoch": 2 }, { "type": "loss", "content": 0.11611234396696091, "timestamp": "2025-09-02 14:14:53.810543", "step": 1472, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:53.863160", "step": 1472, "epoch": 2 }, { "type": "loss", "content": 0.10993252694606781, "timestamp": "2025-09-02 14:14:53.865725", "step": 1473, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:53.919307", "step": 1473, "epoch": 2 }, { "type": "loss", "content": 0.168858602643013, "timestamp": "2025-09-02 14:14:53.921722", "step": 1474, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:53.975216", "step": 1474, "epoch": 2 }, { "type": "loss", "content": 0.20116449892520905, "timestamp": "2025-09-02 14:14:53.976951", "step": 1475, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:54.030313", "step": 1475, "epoch": 2 }, { "type": "loss", "content": 0.2531073987483978, "timestamp": "2025-09-02 14:14:54.036262", "step": 1476, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:54.089140", "step": 1476, "epoch": 2 }, { "type": "loss", "content": 0.15401144325733185, "timestamp": "2025-09-02 14:14:54.091395", "step": 1477, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:54.144890", "step": 1477, "epoch": 2 }, { "type": "loss", "content": 0.1246110051870346, "timestamp": "2025-09-02 14:14:54.146859", "step": 1478, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:54.199674", "step": 1478, "epoch": 2 }, { "type": "loss", "content": 0.12555015087127686, "timestamp": "2025-09-02 14:14:54.201534", "step": 1479, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:54.254572", "step": 1479, "epoch": 2 }, { "type": "loss", "content": 0.27726444602012634, "timestamp": "2025-09-02 14:14:54.260544", "step": 1480, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:14:56.138919", "step": 1480, "epoch": 2 }, { "type": "pplx", "content": 7262.662592831741, "timestamp": "2025-09-02 14:14:56.141377", "step": 1480, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1480", "timestamp": "2025-09-02 14:14:56.500951", "step": 1480, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:56.557621", "step": 1480, "epoch": 2 }, { "type": "loss", "content": 0.11940862983465195, "timestamp": "2025-09-02 14:14:56.560041", "step": 1481, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:56.613813", "step": 1481, "epoch": 2 }, { "type": "loss", "content": 0.18493741750717163, "timestamp": "2025-09-02 14:14:56.615936", "step": 1482, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:56.669588", "step": 1482, "epoch": 2 }, { "type": "loss", "content": 0.12668360769748688, "timestamp": "2025-09-02 14:14:56.671925", "step": 1483, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:56.724600", "step": 1483, "epoch": 2 }, { "type": "loss", "content": 0.1912524700164795, "timestamp": "2025-09-02 14:14:56.730927", "step": 1484, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:56.783759", "step": 1484, "epoch": 2 }, { "type": "loss", "content": 0.40379828214645386, "timestamp": "2025-09-02 14:14:56.786110", "step": 1485, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:56.841155", "step": 1485, "epoch": 2 }, { "type": "loss", "content": 0.17911489307880402, "timestamp": "2025-09-02 14:14:56.843360", "step": 1486, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:56.897568", "step": 1486, "epoch": 2 }, { "type": "loss", "content": 0.15467512607574463, "timestamp": "2025-09-02 14:14:56.899990", "step": 1487, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:56.953802", "step": 1487, "epoch": 2 }, { "type": "loss", "content": 0.2238946557044983, "timestamp": "2025-09-02 14:14:56.959758", "step": 1488, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:57.013017", "step": 1488, "epoch": 2 }, { "type": "loss", "content": 0.16450370848178864, "timestamp": "2025-09-02 14:14:57.015111", "step": 1489, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:57.068088", "step": 1489, "epoch": 2 }, { "type": "loss", "content": 0.1318041831254959, "timestamp": "2025-09-02 14:14:57.070676", "step": 1490, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:57.123588", "step": 1490, "epoch": 2 }, { "type": "loss", "content": 0.19983157515525818, "timestamp": "2025-09-02 14:14:57.127328", "step": 1491, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:57.180134", "step": 1491, "epoch": 2 }, { "type": "loss", "content": 0.07072880119085312, "timestamp": "2025-09-02 14:14:57.186210", "step": 1492, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:57.238487", "step": 1492, "epoch": 2 }, { "type": "loss", "content": 0.1893046647310257, "timestamp": "2025-09-02 14:14:57.240656", "step": 1493, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:57.293423", "step": 1493, "epoch": 2 }, { "type": "loss", "content": 0.22393158078193665, "timestamp": "2025-09-02 14:14:57.295568", "step": 1494, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:57.348374", "step": 1494, "epoch": 2 }, { "type": "loss", "content": 0.2562890946865082, "timestamp": "2025-09-02 14:14:57.350379", "step": 1495, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:57.405321", "step": 1495, "epoch": 2 }, { "type": "loss", "content": 0.17311297357082367, "timestamp": "2025-09-02 14:14:57.411286", "step": 1496, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:57.465095", "step": 1496, "epoch": 2 }, { "type": "loss", "content": 0.15814988315105438, "timestamp": "2025-09-02 14:14:57.467342", "step": 1497, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:57.520228", "step": 1497, "epoch": 2 }, { "type": "loss", "content": 0.1596711426973343, "timestamp": "2025-09-02 14:14:57.522405", "step": 1498, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:57.576489", "step": 1498, "epoch": 2 }, { "type": "loss", "content": 0.1815585345029831, "timestamp": "2025-09-02 14:14:57.578804", "step": 1499, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:57.631698", "step": 1499, "epoch": 2 }, { "type": "loss", "content": 0.12765292823314667, "timestamp": "2025-09-02 14:14:57.638233", "step": 1500, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:14:59.529801", "step": 1500, "epoch": 2 }, { "type": "pplx", "content": 7273.230701217713, "timestamp": "2025-09-02 14:14:59.531919", "step": 1500, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:59.585032", "step": 1500, "epoch": 2 }, { "type": "loss", "content": 0.1286899447441101, "timestamp": "2025-09-02 14:14:59.587065", "step": 1501, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:59.641188", "step": 1501, "epoch": 2 }, { "type": "loss", "content": 0.11897411197423935, "timestamp": "2025-09-02 14:14:59.643274", "step": 1502, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:59.696165", "step": 1502, "epoch": 2 }, { "type": "loss", "content": 0.19318848848342896, "timestamp": "2025-09-02 14:14:59.698362", "step": 1503, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:59.751298", "step": 1503, "epoch": 2 }, { "type": "loss", "content": 0.12096086889505386, "timestamp": "2025-09-02 14:14:59.757498", "step": 1504, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:14:59.810173", "step": 1504, "epoch": 2 }, { "type": "loss", "content": 0.1549685150384903, "timestamp": "2025-09-02 14:14:59.812684", "step": 1505, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:59.866130", "step": 1505, "epoch": 2 }, { "type": "loss", "content": 0.19899596273899078, "timestamp": "2025-09-02 14:14:59.868874", "step": 1506, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:14:59.922151", "step": 1506, "epoch": 2 }, { "type": "loss", "content": 0.1648457646369934, "timestamp": "2025-09-02 14:14:59.924309", "step": 1507, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:14:59.977594", "step": 1507, "epoch": 2 }, { "type": "loss", "content": 0.24824465811252594, "timestamp": "2025-09-02 14:14:59.983542", "step": 1508, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:00.037440", "step": 1508, "epoch": 2 }, { "type": "loss", "content": 0.0756700336933136, "timestamp": "2025-09-02 14:15:00.039715", "step": 1509, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:00.092436", "step": 1509, "epoch": 2 }, { "type": "loss", "content": 0.18836697936058044, "timestamp": "2025-09-02 14:15:00.094733", "step": 1510, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:00.147101", "step": 1510, "epoch": 2 }, { "type": "loss", "content": 0.2012166529893875, "timestamp": "2025-09-02 14:15:00.149426", "step": 1511, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:00.202749", "step": 1511, "epoch": 2 }, { "type": "loss", "content": 0.12765319645404816, "timestamp": "2025-09-02 14:15:00.208540", "step": 1512, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:00.261697", "step": 1512, "epoch": 2 }, { "type": "loss", "content": 0.20101197063922882, "timestamp": "2025-09-02 14:15:00.264035", "step": 1513, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:00.317516", "step": 1513, "epoch": 2 }, { "type": "loss", "content": 0.20844361186027527, "timestamp": "2025-09-02 14:15:00.319795", "step": 1514, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:00.373555", "step": 1514, "epoch": 2 }, { "type": "loss", "content": 0.2290911078453064, "timestamp": "2025-09-02 14:15:00.375723", "step": 1515, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:00.428709", "step": 1515, "epoch": 2 }, { "type": "loss", "content": 0.165852889418602, "timestamp": "2025-09-02 14:15:00.434822", "step": 1516, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:00.487663", "step": 1516, "epoch": 2 }, { "type": "loss", "content": 0.3889223337173462, "timestamp": "2025-09-02 14:15:00.489959", "step": 1517, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:00.544186", "step": 1517, "epoch": 2 }, { "type": "loss", "content": 0.16968119144439697, "timestamp": "2025-09-02 14:15:00.546486", "step": 1518, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:00.599491", "step": 1518, "epoch": 2 }, { "type": "loss", "content": 0.1884487122297287, "timestamp": "2025-09-02 14:15:00.601814", "step": 1519, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:00.654975", "step": 1519, "epoch": 2 }, { "type": "loss", "content": 0.14565923810005188, "timestamp": "2025-09-02 14:15:00.665160", "step": 1520, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:15:02.552669", "step": 1520, "epoch": 2 }, { "type": "pplx", "content": 7243.4197591612765, "timestamp": "2025-09-02 14:15:02.555124", "step": 1520, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1520", "timestamp": "2025-09-02 14:15:02.907908", "step": 1520, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:02.963858", "step": 1520, "epoch": 2 }, { "type": "loss", "content": 0.11342111229896545, "timestamp": "2025-09-02 14:15:02.966286", "step": 1521, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:03.020290", "step": 1521, "epoch": 2 }, { "type": "loss", "content": 0.23216646909713745, "timestamp": "2025-09-02 14:15:03.022559", "step": 1522, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:03.077281", "step": 1522, "epoch": 2 }, { "type": "loss", "content": 0.32464924454689026, "timestamp": "2025-09-02 14:15:03.079713", "step": 1523, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:03.132674", "step": 1523, "epoch": 2 }, { "type": "loss", "content": 0.265799880027771, "timestamp": "2025-09-02 14:15:03.138927", "step": 1524, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:03.191240", "step": 1524, "epoch": 2 }, { "type": "loss", "content": 0.19903908669948578, "timestamp": "2025-09-02 14:15:03.193497", "step": 1525, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:03.247587", "step": 1525, "epoch": 2 }, { "type": "loss", "content": 0.19406777620315552, "timestamp": "2025-09-02 14:15:03.249476", "step": 1526, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:03.302645", "step": 1526, "epoch": 2 }, { "type": "loss", "content": 0.2234082967042923, "timestamp": "2025-09-02 14:15:03.306966", "step": 1527, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:03.359982", "step": 1527, "epoch": 2 }, { "type": "loss", "content": 0.18382270634174347, "timestamp": "2025-09-02 14:15:03.365912", "step": 1528, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:03.418189", "step": 1528, "epoch": 2 }, { "type": "loss", "content": 0.1330796331167221, "timestamp": "2025-09-02 14:15:03.420524", "step": 1529, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:03.473493", "step": 1529, "epoch": 2 }, { "type": "loss", "content": 0.19670610129833221, "timestamp": "2025-09-02 14:15:03.475759", "step": 1530, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:03.528838", "step": 1530, "epoch": 2 }, { "type": "loss", "content": 0.2293892204761505, "timestamp": "2025-09-02 14:15:03.531167", "step": 1531, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:03.585892", "step": 1531, "epoch": 2 }, { "type": "loss", "content": 0.12804216146469116, "timestamp": "2025-09-02 14:15:03.591563", "step": 1532, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:03.643508", "step": 1532, "epoch": 2 }, { "type": "loss", "content": 0.222395122051239, "timestamp": "2025-09-02 14:15:03.645814", "step": 1533, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:03.698955", "step": 1533, "epoch": 2 }, { "type": "loss", "content": 0.27419352531433105, "timestamp": "2025-09-02 14:15:03.701043", "step": 1534, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:03.754800", "step": 1534, "epoch": 2 }, { "type": "loss", "content": 0.2926202714443207, "timestamp": "2025-09-02 14:15:03.757000", "step": 1535, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:03.810310", "step": 1535, "epoch": 2 }, { "type": "loss", "content": 0.23972547054290771, "timestamp": "2025-09-02 14:15:03.817250", "step": 1536, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:03.870016", "step": 1536, "epoch": 2 }, { "type": "loss", "content": 0.17130322754383087, "timestamp": "2025-09-02 14:15:03.872523", "step": 1537, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:03.926397", "step": 1537, "epoch": 2 }, { "type": "loss", "content": 0.1482105851173401, "timestamp": "2025-09-02 14:15:03.930298", "step": 1538, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:03.989589", "step": 1538, "epoch": 2 }, { "type": "loss", "content": 0.21886299550533295, "timestamp": "2025-09-02 14:15:03.995463", "step": 1539, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:04.057497", "step": 1539, "epoch": 2 }, { "type": "loss", "content": 0.15087607502937317, "timestamp": "2025-09-02 14:15:04.063978", "step": 1540, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:15:05.977623", "step": 1540, "epoch": 2 }, { "type": "pplx", "content": 7105.059260903607, "timestamp": "2025-09-02 14:15:05.979741", "step": 1540, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:06.034013", "step": 1540, "epoch": 2 }, { "type": "loss", "content": 0.24125409126281738, "timestamp": "2025-09-02 14:15:06.036175", "step": 1541, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:06.090136", "step": 1541, "epoch": 2 }, { "type": "loss", "content": 0.2455378770828247, "timestamp": "2025-09-02 14:15:06.092123", "step": 1542, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:06.145334", "step": 1542, "epoch": 2 }, { "type": "loss", "content": 0.1780426800251007, "timestamp": "2025-09-02 14:15:06.147197", "step": 1543, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:06.200099", "step": 1543, "epoch": 2 }, { "type": "loss", "content": 0.199091836810112, "timestamp": "2025-09-02 14:15:06.205890", "step": 1544, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:06.261087", "step": 1544, "epoch": 2 }, { "type": "loss", "content": 0.2381470948457718, "timestamp": "2025-09-02 14:15:06.263311", "step": 1545, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:06.318283", "step": 1545, "epoch": 2 }, { "type": "loss", "content": 0.24724140763282776, "timestamp": "2025-09-02 14:15:06.326320", "step": 1546, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:06.395656", "step": 1546, "epoch": 2 }, { "type": "loss", "content": 0.1840948909521103, "timestamp": "2025-09-02 14:15:06.398230", "step": 1547, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:06.454348", "step": 1547, "epoch": 2 }, { "type": "loss", "content": 0.1839980036020279, "timestamp": "2025-09-02 14:15:06.460429", "step": 1548, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:06.512809", "step": 1548, "epoch": 2 }, { "type": "loss", "content": 0.09325284510850906, "timestamp": "2025-09-02 14:15:06.515047", "step": 1549, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:06.568500", "step": 1549, "epoch": 2 }, { "type": "loss", "content": 0.23089973628520966, "timestamp": "2025-09-02 14:15:06.570782", "step": 1550, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:06.623763", "step": 1550, "epoch": 2 }, { "type": "loss", "content": 0.2320786565542221, "timestamp": "2025-09-02 14:15:06.625989", "step": 1551, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:06.679169", "step": 1551, "epoch": 2 }, { "type": "loss", "content": 0.16476991772651672, "timestamp": "2025-09-02 14:15:06.686586", "step": 1552, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:06.740124", "step": 1552, "epoch": 2 }, { "type": "loss", "content": 0.35730433464050293, "timestamp": "2025-09-02 14:15:06.742750", "step": 1553, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:06.796338", "step": 1553, "epoch": 2 }, { "type": "loss", "content": 0.1647089272737503, "timestamp": "2025-09-02 14:15:06.799257", "step": 1554, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:06.852851", "step": 1554, "epoch": 2 }, { "type": "loss", "content": 0.08393990993499756, "timestamp": "2025-09-02 14:15:06.855182", "step": 1555, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:06.908694", "step": 1555, "epoch": 2 }, { "type": "loss", "content": 0.2546108067035675, "timestamp": "2025-09-02 14:15:06.914681", "step": 1556, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:06.966693", "step": 1556, "epoch": 2 }, { "type": "loss", "content": 0.1156020537018776, "timestamp": "2025-09-02 14:15:06.968998", "step": 1557, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:07.023709", "step": 1557, "epoch": 2 }, { "type": "loss", "content": 0.173190176486969, "timestamp": "2025-09-02 14:15:07.026072", "step": 1558, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:07.079019", "step": 1558, "epoch": 2 }, { "type": "loss", "content": 0.16669915616512299, "timestamp": "2025-09-02 14:15:07.080842", "step": 1559, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:07.133596", "step": 1559, "epoch": 2 }, { "type": "loss", "content": 0.15022210776805878, "timestamp": "2025-09-02 14:15:07.139024", "step": 1560, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:15:09.015847", "step": 1560, "epoch": 2 }, { "type": "pplx", "content": 7106.071903621284, "timestamp": "2025-09-02 14:15:09.022143", "step": 1560, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1560", "timestamp": "2025-09-02 14:15:09.416114", "step": 1560, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:09.475664", "step": 1560, "epoch": 2 }, { "type": "loss", "content": 0.20024807751178741, "timestamp": "2025-09-02 14:15:09.480724", "step": 1561, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:09.538162", "step": 1561, "epoch": 2 }, { "type": "loss", "content": 0.19442802667617798, "timestamp": "2025-09-02 14:15:09.543754", "step": 1562, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:09.600519", "step": 1562, "epoch": 2 }, { "type": "loss", "content": 0.25025758147239685, "timestamp": "2025-09-02 14:15:09.608788", "step": 1563, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:09.662569", "step": 1563, "epoch": 2 }, { "type": "loss", "content": 0.2411840856075287, "timestamp": "2025-09-02 14:15:09.670245", "step": 1564, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:09.724560", "step": 1564, "epoch": 2 }, { "type": "loss", "content": 0.09095103293657303, "timestamp": "2025-09-02 14:15:09.739328", "step": 1565, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:09.804421", "step": 1565, "epoch": 2 }, { "type": "loss", "content": 0.2548292279243469, "timestamp": "2025-09-02 14:15:09.808198", "step": 1566, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:09.863544", "step": 1566, "epoch": 2 }, { "type": "loss", "content": 0.31083497405052185, "timestamp": "2025-09-02 14:15:09.867340", "step": 1567, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:09.927083", "step": 1567, "epoch": 2 }, { "type": "loss", "content": 0.15349215269088745, "timestamp": "2025-09-02 14:15:09.935697", "step": 1568, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:09.989931", "step": 1568, "epoch": 2 }, { "type": "loss", "content": 0.18469861149787903, "timestamp": "2025-09-02 14:15:09.994007", "step": 1569, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:10.050094", "step": 1569, "epoch": 2 }, { "type": "loss", "content": 0.22245927155017853, "timestamp": "2025-09-02 14:15:10.061440", "step": 1570, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:10.130975", "step": 1570, "epoch": 2 }, { "type": "loss", "content": 0.1581275761127472, "timestamp": "2025-09-02 14:15:10.150377", "step": 1571, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:10.214462", "step": 1571, "epoch": 2 }, { "type": "loss", "content": 0.2463005930185318, "timestamp": "2025-09-02 14:15:10.226753", "step": 1572, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:10.288336", "step": 1572, "epoch": 2 }, { "type": "loss", "content": 0.12329984456300735, "timestamp": "2025-09-02 14:15:10.291991", "step": 1573, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:10.345738", "step": 1573, "epoch": 2 }, { "type": "loss", "content": 0.1954256296157837, "timestamp": "2025-09-02 14:15:10.348928", "step": 1574, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:10.404852", "step": 1574, "epoch": 2 }, { "type": "loss", "content": 0.12687112390995026, "timestamp": "2025-09-02 14:15:10.410199", "step": 1575, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:10.465724", "step": 1575, "epoch": 2 }, { "type": "loss", "content": 0.19634678959846497, "timestamp": "2025-09-02 14:15:10.474204", "step": 1576, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:10.528290", "step": 1576, "epoch": 2 }, { "type": "loss", "content": 0.1641768515110016, "timestamp": "2025-09-02 14:15:10.534420", "step": 1577, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:10.587329", "step": 1577, "epoch": 2 }, { "type": "loss", "content": 0.12585335969924927, "timestamp": "2025-09-02 14:15:10.589478", "step": 1578, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:10.642031", "step": 1578, "epoch": 2 }, { "type": "loss", "content": 0.2902316749095917, "timestamp": "2025-09-02 14:15:10.646169", "step": 1579, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:10.700550", "step": 1579, "epoch": 2 }, { "type": "loss", "content": 0.25188907980918884, "timestamp": "2025-09-02 14:15:10.712559", "step": 1580, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:15:12.586704", "step": 1580, "epoch": 2 }, { "type": "pplx", "content": 7090.120771093685, "timestamp": "2025-09-02 14:15:12.589374", "step": 1580, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:12.642613", "step": 1580, "epoch": 2 }, { "type": "loss", "content": 0.1740882396697998, "timestamp": "2025-09-02 14:15:12.646364", "step": 1581, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:12.704234", "step": 1581, "epoch": 2 }, { "type": "loss", "content": 0.2840808629989624, "timestamp": "2025-09-02 14:15:12.709058", "step": 1582, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:12.763690", "step": 1582, "epoch": 2 }, { "type": "loss", "content": 0.2103690505027771, "timestamp": "2025-09-02 14:15:12.766885", "step": 1583, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:12.822589", "step": 1583, "epoch": 2 }, { "type": "loss", "content": 0.10891769826412201, "timestamp": "2025-09-02 14:15:12.829389", "step": 1584, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:12.886141", "step": 1584, "epoch": 2 }, { "type": "loss", "content": 0.1516328603029251, "timestamp": "2025-09-02 14:15:12.893236", "step": 1585, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:12.950117", "step": 1585, "epoch": 2 }, { "type": "loss", "content": 0.31081685423851013, "timestamp": "2025-09-02 14:15:12.954970", "step": 1586, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:13.011527", "step": 1586, "epoch": 2 }, { "type": "loss", "content": 0.122956782579422, "timestamp": "2025-09-02 14:15:13.018299", "step": 1587, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:13.075415", "step": 1587, "epoch": 2 }, { "type": "loss", "content": 0.1033792644739151, "timestamp": "2025-09-02 14:15:13.087936", "step": 1588, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:13.144192", "step": 1588, "epoch": 2 }, { "type": "loss", "content": 0.19543009996414185, "timestamp": "2025-09-02 14:15:13.151796", "step": 1589, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:13.208128", "step": 1589, "epoch": 2 }, { "type": "loss", "content": 0.1304125189781189, "timestamp": "2025-09-02 14:15:13.212883", "step": 1590, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:13.267724", "step": 1590, "epoch": 2 }, { "type": "loss", "content": 0.20495104789733887, "timestamp": "2025-09-02 14:15:13.281469", "step": 1591, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:13.346663", "step": 1591, "epoch": 2 }, { "type": "loss", "content": 0.07823386043310165, "timestamp": "2025-09-02 14:15:13.353699", "step": 1592, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:13.408615", "step": 1592, "epoch": 2 }, { "type": "loss", "content": 0.3362092077732086, "timestamp": "2025-09-02 14:15:13.411287", "step": 1593, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:13.464511", "step": 1593, "epoch": 2 }, { "type": "loss", "content": 0.2595943510532379, "timestamp": "2025-09-02 14:15:13.467201", "step": 1594, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:13.522269", "step": 1594, "epoch": 2 }, { "type": "loss", "content": 0.2369784563779831, "timestamp": "2025-09-02 14:15:13.524975", "step": 1595, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:13.579085", "step": 1595, "epoch": 2 }, { "type": "loss", "content": 0.15497809648513794, "timestamp": "2025-09-02 14:15:13.590114", "step": 1596, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:13.648395", "step": 1596, "epoch": 2 }, { "type": "loss", "content": 0.15536612272262573, "timestamp": "2025-09-02 14:15:13.650721", "step": 1597, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:13.703534", "step": 1597, "epoch": 2 }, { "type": "loss", "content": 0.10165808349847794, "timestamp": "2025-09-02 14:15:13.707496", "step": 1598, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:13.761727", "step": 1598, "epoch": 2 }, { "type": "loss", "content": 0.2126156985759735, "timestamp": "2025-09-02 14:15:13.776768", "step": 1599, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:13.844556", "step": 1599, "epoch": 2 }, { "type": "loss", "content": 0.12469130009412766, "timestamp": "2025-09-02 14:15:13.865556", "step": 1600, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:15:15.726308", "step": 1600, "epoch": 2 }, { "type": "pplx", "content": 7095.413891644909, "timestamp": "2025-09-02 14:15:15.728740", "step": 1600, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1600", "timestamp": "2025-09-02 14:15:16.091111", "step": 1600, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:16.147412", "step": 1600, "epoch": 2 }, { "type": "loss", "content": 0.23795343935489655, "timestamp": "2025-09-02 14:15:16.149559", "step": 1601, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:16.203260", "step": 1601, "epoch": 2 }, { "type": "loss", "content": 0.12701527774333954, "timestamp": "2025-09-02 14:15:16.205952", "step": 1602, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:16.258925", "step": 1602, "epoch": 2 }, { "type": "loss", "content": 0.11535976082086563, "timestamp": "2025-09-02 14:15:16.261042", "step": 1603, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:16.314561", "step": 1603, "epoch": 2 }, { "type": "loss", "content": 0.14093536138534546, "timestamp": "2025-09-02 14:15:16.320444", "step": 1604, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:16.371982", "step": 1604, "epoch": 2 }, { "type": "loss", "content": 0.19321244955062866, "timestamp": "2025-09-02 14:15:16.374202", "step": 1605, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:16.426712", "step": 1605, "epoch": 2 }, { "type": "loss", "content": 0.30391135811805725, "timestamp": "2025-09-02 14:15:16.429136", "step": 1606, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:16.481364", "step": 1606, "epoch": 2 }, { "type": "loss", "content": 0.21403028070926666, "timestamp": "2025-09-02 14:15:16.483457", "step": 1607, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:16.536170", "step": 1607, "epoch": 2 }, { "type": "loss", "content": 0.21783512830734253, "timestamp": "2025-09-02 14:15:16.541873", "step": 1608, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:16.593919", "step": 1608, "epoch": 2 }, { "type": "loss", "content": 0.22770410776138306, "timestamp": "2025-09-02 14:15:16.596227", "step": 1609, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:16.650121", "step": 1609, "epoch": 2 }, { "type": "loss", "content": 0.2492663711309433, "timestamp": "2025-09-02 14:15:16.652464", "step": 1610, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:16.705066", "step": 1610, "epoch": 2 }, { "type": "loss", "content": 0.12716512382030487, "timestamp": "2025-09-02 14:15:16.707413", "step": 1611, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:16.760320", "step": 1611, "epoch": 2 }, { "type": "loss", "content": 0.1844222992658615, "timestamp": "2025-09-02 14:15:16.766484", "step": 1612, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:16.819463", "step": 1612, "epoch": 2 }, { "type": "loss", "content": 0.12449739873409271, "timestamp": "2025-09-02 14:15:16.822706", "step": 1613, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:16.877109", "step": 1613, "epoch": 2 }, { "type": "loss", "content": 0.2172544002532959, "timestamp": "2025-09-02 14:15:16.879860", "step": 1614, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:16.932792", "step": 1614, "epoch": 2 }, { "type": "loss", "content": 0.23007997870445251, "timestamp": "2025-09-02 14:15:16.935299", "step": 1615, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:16.988482", "step": 1615, "epoch": 2 }, { "type": "loss", "content": 0.14505217969417572, "timestamp": "2025-09-02 14:15:16.994245", "step": 1616, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:17.046630", "step": 1616, "epoch": 2 }, { "type": "loss", "content": 0.09558647871017456, "timestamp": "2025-09-02 14:15:17.049473", "step": 1617, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:17.102537", "step": 1617, "epoch": 2 }, { "type": "loss", "content": 0.10997369885444641, "timestamp": "2025-09-02 14:15:17.104816", "step": 1618, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:17.159005", "step": 1618, "epoch": 2 }, { "type": "loss", "content": 0.12362290918827057, "timestamp": "2025-09-02 14:15:17.161179", "step": 1619, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:17.214718", "step": 1619, "epoch": 2 }, { "type": "loss", "content": 0.21529777348041534, "timestamp": "2025-09-02 14:15:17.220585", "step": 1620, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:15:19.088565", "step": 1620, "epoch": 2 }, { "type": "pplx", "content": 7114.1189254789115, "timestamp": "2025-09-02 14:15:19.090599", "step": 1620, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:19.143339", "step": 1620, "epoch": 2 }, { "type": "loss", "content": 0.26486802101135254, "timestamp": "2025-09-02 14:15:19.145417", "step": 1621, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:19.198672", "step": 1621, "epoch": 2 }, { "type": "loss", "content": 0.20550940930843353, "timestamp": "2025-09-02 14:15:19.200529", "step": 1622, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:19.253288", "step": 1622, "epoch": 2 }, { "type": "loss", "content": 0.2211744487285614, "timestamp": "2025-09-02 14:15:19.256340", "step": 1623, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:19.309089", "step": 1623, "epoch": 2 }, { "type": "loss", "content": 0.09456345438957214, "timestamp": "2025-09-02 14:15:19.315110", "step": 1624, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:19.367664", "step": 1624, "epoch": 2 }, { "type": "loss", "content": 0.240533247590065, "timestamp": "2025-09-02 14:15:19.369920", "step": 1625, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:19.423703", "step": 1625, "epoch": 2 }, { "type": "loss", "content": 0.22233422100543976, "timestamp": "2025-09-02 14:15:19.425907", "step": 1626, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:19.478869", "step": 1626, "epoch": 2 }, { "type": "loss", "content": 0.15739883482456207, "timestamp": "2025-09-02 14:15:19.481171", "step": 1627, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:19.534316", "step": 1627, "epoch": 2 }, { "type": "loss", "content": 0.3311193883419037, "timestamp": "2025-09-02 14:15:19.541928", "step": 1628, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:19.594145", "step": 1628, "epoch": 2 }, { "type": "loss", "content": 0.131230428814888, "timestamp": "2025-09-02 14:15:19.596464", "step": 1629, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:19.650195", "step": 1629, "epoch": 2 }, { "type": "loss", "content": 0.11819442361593246, "timestamp": "2025-09-02 14:15:19.652367", "step": 1630, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:19.704800", "step": 1630, "epoch": 2 }, { "type": "loss", "content": 0.28455740213394165, "timestamp": "2025-09-02 14:15:19.707145", "step": 1631, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:19.760449", "step": 1631, "epoch": 2 }, { "type": "loss", "content": 0.17441265285015106, "timestamp": "2025-09-02 14:15:19.766700", "step": 1632, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:19.819199", "step": 1632, "epoch": 2 }, { "type": "loss", "content": 0.32945841550827026, "timestamp": "2025-09-02 14:15:19.821567", "step": 1633, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:19.874599", "step": 1633, "epoch": 2 }, { "type": "loss", "content": 0.16343897581100464, "timestamp": "2025-09-02 14:15:19.876879", "step": 1634, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:19.929596", "step": 1634, "epoch": 2 }, { "type": "loss", "content": 0.2734418511390686, "timestamp": "2025-09-02 14:15:19.931883", "step": 1635, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:19.984598", "step": 1635, "epoch": 2 }, { "type": "loss", "content": 0.2731206715106964, "timestamp": "2025-09-02 14:15:19.990488", "step": 1636, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:20.042564", "step": 1636, "epoch": 2 }, { "type": "loss", "content": 0.05348249524831772, "timestamp": "2025-09-02 14:15:20.045508", "step": 1637, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:20.099245", "step": 1637, "epoch": 2 }, { "type": "loss", "content": 0.2739405035972595, "timestamp": "2025-09-02 14:15:20.101459", "step": 1638, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:20.154447", "step": 1638, "epoch": 2 }, { "type": "loss", "content": 0.14740179479122162, "timestamp": "2025-09-02 14:15:20.156714", "step": 1639, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:20.209242", "step": 1639, "epoch": 2 }, { "type": "loss", "content": 0.08692299574613571, "timestamp": "2025-09-02 14:15:20.214994", "step": 1640, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:15:22.078892", "step": 1640, "epoch": 2 }, { "type": "pplx", "content": 7140.335474865979, "timestamp": "2025-09-02 14:15:22.081213", "step": 1640, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1640", "timestamp": "2025-09-02 14:15:22.440604", "step": 1640, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:22.496044", "step": 1640, "epoch": 2 }, { "type": "loss", "content": 0.13427968323230743, "timestamp": "2025-09-02 14:15:22.498118", "step": 1641, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:22.551584", "step": 1641, "epoch": 2 }, { "type": "loss", "content": 0.19201700389385223, "timestamp": "2025-09-02 14:15:22.553867", "step": 1642, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:22.607104", "step": 1642, "epoch": 2 }, { "type": "loss", "content": 0.15202665328979492, "timestamp": "2025-09-02 14:15:22.609389", "step": 1643, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:22.662741", "step": 1643, "epoch": 2 }, { "type": "loss", "content": 0.21047675609588623, "timestamp": "2025-09-02 14:15:22.668961", "step": 1644, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:22.722620", "step": 1644, "epoch": 2 }, { "type": "loss", "content": 0.19752009212970734, "timestamp": "2025-09-02 14:15:22.724940", "step": 1645, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:22.777716", "step": 1645, "epoch": 2 }, { "type": "loss", "content": 0.1422591656446457, "timestamp": "2025-09-02 14:15:22.780711", "step": 1646, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:22.833537", "step": 1646, "epoch": 2 }, { "type": "loss", "content": 0.17718879878520966, "timestamp": "2025-09-02 14:15:22.835735", "step": 1647, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:22.888492", "step": 1647, "epoch": 2 }, { "type": "loss", "content": 0.2547493875026703, "timestamp": "2025-09-02 14:15:22.894958", "step": 1648, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:22.947764", "step": 1648, "epoch": 2 }, { "type": "loss", "content": 0.16405217349529266, "timestamp": "2025-09-02 14:15:22.950947", "step": 1649, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:23.004181", "step": 1649, "epoch": 2 }, { "type": "loss", "content": 0.26531341671943665, "timestamp": "2025-09-02 14:15:23.006530", "step": 1650, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:23.059260", "step": 1650, "epoch": 2 }, { "type": "loss", "content": 0.0888519361615181, "timestamp": "2025-09-02 14:15:23.061664", "step": 1651, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:23.114513", "step": 1651, "epoch": 2 }, { "type": "loss", "content": 0.22230902314186096, "timestamp": "2025-09-02 14:15:23.120495", "step": 1652, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:23.172869", "step": 1652, "epoch": 2 }, { "type": "loss", "content": 0.1864284723997116, "timestamp": "2025-09-02 14:15:23.174985", "step": 1653, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:23.227428", "step": 1653, "epoch": 2 }, { "type": "loss", "content": 0.1672561764717102, "timestamp": "2025-09-02 14:15:23.229840", "step": 1654, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:23.282096", "step": 1654, "epoch": 2 }, { "type": "loss", "content": 0.11104092746973038, "timestamp": "2025-09-02 14:15:23.284372", "step": 1655, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:23.338118", "step": 1655, "epoch": 2 }, { "type": "loss", "content": 0.2998628318309784, "timestamp": "2025-09-02 14:15:23.344670", "step": 1656, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:23.397064", "step": 1656, "epoch": 2 }, { "type": "loss", "content": 0.14264217019081116, "timestamp": "2025-09-02 14:15:23.399078", "step": 1657, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:23.451503", "step": 1657, "epoch": 2 }, { "type": "loss", "content": 0.15764345228672028, "timestamp": "2025-09-02 14:15:23.453653", "step": 1658, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:23.506419", "step": 1658, "epoch": 2 }, { "type": "loss", "content": 0.15628591179847717, "timestamp": "2025-09-02 14:15:23.508606", "step": 1659, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:23.560757", "step": 1659, "epoch": 2 }, { "type": "loss", "content": 0.17300426959991455, "timestamp": "2025-09-02 14:15:23.566614", "step": 1660, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:15:25.426923", "step": 1660, "epoch": 2 }, { "type": "pplx", "content": 7188.760277495545, "timestamp": "2025-09-02 14:15:25.429188", "step": 1660, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:25.481856", "step": 1660, "epoch": 2 }, { "type": "loss", "content": 0.1729632169008255, "timestamp": "2025-09-02 14:15:25.484126", "step": 1661, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:25.537798", "step": 1661, "epoch": 2 }, { "type": "loss", "content": 0.30053210258483887, "timestamp": "2025-09-02 14:15:25.540592", "step": 1662, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:25.593567", "step": 1662, "epoch": 2 }, { "type": "loss", "content": 0.4345700144767761, "timestamp": "2025-09-02 14:15:25.595720", "step": 1663, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:25.648669", "step": 1663, "epoch": 2 }, { "type": "loss", "content": 0.09475085139274597, "timestamp": "2025-09-02 14:15:25.654982", "step": 1664, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:25.707289", "step": 1664, "epoch": 2 }, { "type": "loss", "content": 0.3434130549430847, "timestamp": "2025-09-02 14:15:25.709699", "step": 1665, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:25.763394", "step": 1665, "epoch": 2 }, { "type": "loss", "content": 0.09037952125072479, "timestamp": "2025-09-02 14:15:25.765698", "step": 1666, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:25.818496", "step": 1666, "epoch": 2 }, { "type": "loss", "content": 0.20592756569385529, "timestamp": "2025-09-02 14:15:25.820608", "step": 1667, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:25.873528", "step": 1667, "epoch": 2 }, { "type": "loss", "content": 0.17029623687267303, "timestamp": "2025-09-02 14:15:25.879324", "step": 1668, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:25.932765", "step": 1668, "epoch": 2 }, { "type": "loss", "content": 0.35758185386657715, "timestamp": "2025-09-02 14:15:25.934983", "step": 1669, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:25.987854", "step": 1669, "epoch": 2 }, { "type": "loss", "content": 0.0773768275976181, "timestamp": "2025-09-02 14:15:25.989987", "step": 1670, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:26.042813", "step": 1670, "epoch": 2 }, { "type": "loss", "content": 0.36584627628326416, "timestamp": "2025-09-02 14:15:26.044980", "step": 1671, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:26.097956", "step": 1671, "epoch": 2 }, { "type": "loss", "content": 0.1521451324224472, "timestamp": "2025-09-02 14:15:26.103798", "step": 1672, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:26.155626", "step": 1672, "epoch": 2 }, { "type": "loss", "content": 0.10559500008821487, "timestamp": "2025-09-02 14:15:26.157838", "step": 1673, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:26.210425", "step": 1673, "epoch": 2 }, { "type": "loss", "content": 0.269221693277359, "timestamp": "2025-09-02 14:15:26.212849", "step": 1674, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:26.266316", "step": 1674, "epoch": 2 }, { "type": "loss", "content": 0.21161945164203644, "timestamp": "2025-09-02 14:15:26.268363", "step": 1675, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:26.321521", "step": 1675, "epoch": 2 }, { "type": "loss", "content": 0.1561059057712555, "timestamp": "2025-09-02 14:15:26.328405", "step": 1676, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:26.380828", "step": 1676, "epoch": 2 }, { "type": "loss", "content": 0.11514154821634293, "timestamp": "2025-09-02 14:15:26.383078", "step": 1677, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:26.436299", "step": 1677, "epoch": 2 }, { "type": "loss", "content": 0.20675429701805115, "timestamp": "2025-09-02 14:15:26.438409", "step": 1678, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:26.491686", "step": 1678, "epoch": 2 }, { "type": "loss", "content": 0.08985036611557007, "timestamp": "2025-09-02 14:15:26.494111", "step": 1679, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:26.548191", "step": 1679, "epoch": 2 }, { "type": "loss", "content": 0.24079148471355438, "timestamp": "2025-09-02 14:15:26.554401", "step": 1680, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:15:28.413793", "step": 1680, "epoch": 2 }, { "type": "pplx", "content": 7263.026405287225, "timestamp": "2025-09-02 14:15:28.416882", "step": 1680, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1680", "timestamp": "2025-09-02 14:15:28.775924", "step": 1680, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:28.833037", "step": 1680, "epoch": 2 }, { "type": "loss", "content": 0.12359210848808289, "timestamp": "2025-09-02 14:15:28.835508", "step": 1681, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:28.891087", "step": 1681, "epoch": 2 }, { "type": "loss", "content": 0.1997320055961609, "timestamp": "2025-09-02 14:15:28.893702", "step": 1682, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:28.947656", "step": 1682, "epoch": 2 }, { "type": "loss", "content": 0.21237629652023315, "timestamp": "2025-09-02 14:15:28.949887", "step": 1683, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:29.004313", "step": 1683, "epoch": 2 }, { "type": "loss", "content": 0.16593314707279205, "timestamp": "2025-09-02 14:15:29.010711", "step": 1684, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:29.063823", "step": 1684, "epoch": 2 }, { "type": "loss", "content": 0.16863465309143066, "timestamp": "2025-09-02 14:15:29.066090", "step": 1685, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:29.120078", "step": 1685, "epoch": 2 }, { "type": "loss", "content": 0.3313466012477875, "timestamp": "2025-09-02 14:15:29.122396", "step": 1686, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:29.177501", "step": 1686, "epoch": 2 }, { "type": "loss", "content": 0.1886720210313797, "timestamp": "2025-09-02 14:15:29.179928", "step": 1687, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:29.232811", "step": 1687, "epoch": 2 }, { "type": "loss", "content": 0.2135927826166153, "timestamp": "2025-09-02 14:15:29.239070", "step": 1688, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:29.291964", "step": 1688, "epoch": 2 }, { "type": "loss", "content": 0.26523062586784363, "timestamp": "2025-09-02 14:15:29.294395", "step": 1689, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:29.347657", "step": 1689, "epoch": 2 }, { "type": "loss", "content": 0.08137743175029755, "timestamp": "2025-09-02 14:15:29.350020", "step": 1690, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:29.403477", "step": 1690, "epoch": 2 }, { "type": "loss", "content": 0.08400006592273712, "timestamp": "2025-09-02 14:15:29.405843", "step": 1691, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:29.458704", "step": 1691, "epoch": 2 }, { "type": "loss", "content": 0.15945863723754883, "timestamp": "2025-09-02 14:15:29.464881", "step": 1692, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:29.517211", "step": 1692, "epoch": 2 }, { "type": "loss", "content": 0.14274975657463074, "timestamp": "2025-09-02 14:15:29.519722", "step": 1693, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:29.572479", "step": 1693, "epoch": 2 }, { "type": "loss", "content": 0.12938006222248077, "timestamp": "2025-09-02 14:15:29.574811", "step": 1694, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:29.627705", "step": 1694, "epoch": 2 }, { "type": "loss", "content": 0.1999073177576065, "timestamp": "2025-09-02 14:15:29.630204", "step": 1695, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:29.683881", "step": 1695, "epoch": 2 }, { "type": "loss", "content": 0.22561395168304443, "timestamp": "2025-09-02 14:15:29.690475", "step": 1696, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:29.743651", "step": 1696, "epoch": 2 }, { "type": "loss", "content": 0.3293859362602234, "timestamp": "2025-09-02 14:15:29.745919", "step": 1697, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:29.798788", "step": 1697, "epoch": 2 }, { "type": "loss", "content": 0.2910382151603699, "timestamp": "2025-09-02 14:15:29.801278", "step": 1698, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:29.854361", "step": 1698, "epoch": 2 }, { "type": "loss", "content": 0.12906749546527863, "timestamp": "2025-09-02 14:15:29.856724", "step": 1699, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:29.910116", "step": 1699, "epoch": 2 }, { "type": "loss", "content": 0.2212066799402237, "timestamp": "2025-09-02 14:15:29.916383", "step": 1700, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:15:31.781715", "step": 1700, "epoch": 2 }, { "type": "pplx", "content": 7357.870436933247, "timestamp": "2025-09-02 14:15:31.783964", "step": 1700, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:31.837183", "step": 1700, "epoch": 2 }, { "type": "loss", "content": 0.19001856446266174, "timestamp": "2025-09-02 14:15:31.839707", "step": 1701, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:31.893313", "step": 1701, "epoch": 2 }, { "type": "loss", "content": 0.22698694467544556, "timestamp": "2025-09-02 14:15:31.896707", "step": 1702, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:31.950183", "step": 1702, "epoch": 2 }, { "type": "loss", "content": 0.10151347517967224, "timestamp": "2025-09-02 14:15:31.952725", "step": 1703, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:32.007640", "step": 1703, "epoch": 2 }, { "type": "loss", "content": 0.2474299520254135, "timestamp": "2025-09-02 14:15:32.013799", "step": 1704, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:32.066518", "step": 1704, "epoch": 2 }, { "type": "loss", "content": 0.2298295646905899, "timestamp": "2025-09-02 14:15:32.068823", "step": 1705, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:32.121665", "step": 1705, "epoch": 2 }, { "type": "loss", "content": 0.1723732352256775, "timestamp": "2025-09-02 14:15:32.124156", "step": 1706, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:32.177968", "step": 1706, "epoch": 2 }, { "type": "loss", "content": 0.13393743336200714, "timestamp": "2025-09-02 14:15:32.180104", "step": 1707, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:32.233485", "step": 1707, "epoch": 2 }, { "type": "loss", "content": 0.0827023833990097, "timestamp": "2025-09-02 14:15:32.239389", "step": 1708, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:32.293098", "step": 1708, "epoch": 2 }, { "type": "loss", "content": 0.17119745910167694, "timestamp": "2025-09-02 14:15:32.296030", "step": 1709, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:32.349234", "step": 1709, "epoch": 2 }, { "type": "loss", "content": 0.2828964591026306, "timestamp": "2025-09-02 14:15:32.351742", "step": 1710, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:32.404776", "step": 1710, "epoch": 2 }, { "type": "loss", "content": 0.19482100009918213, "timestamp": "2025-09-02 14:15:32.407084", "step": 1711, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:32.459719", "step": 1711, "epoch": 2 }, { "type": "loss", "content": 0.10923542082309723, "timestamp": "2025-09-02 14:15:32.465926", "step": 1712, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:32.518907", "step": 1712, "epoch": 2 }, { "type": "loss", "content": 0.21167612075805664, "timestamp": "2025-09-02 14:15:32.521262", "step": 1713, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:32.574633", "step": 1713, "epoch": 2 }, { "type": "loss", "content": 0.10867512971162796, "timestamp": "2025-09-02 14:15:32.576982", "step": 1714, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:32.629754", "step": 1714, "epoch": 2 }, { "type": "loss", "content": 0.15462447702884674, "timestamp": "2025-09-02 14:15:32.631986", "step": 1715, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:32.684562", "step": 1715, "epoch": 2 }, { "type": "loss", "content": 0.06639353185892105, "timestamp": "2025-09-02 14:15:32.690583", "step": 1716, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:32.743087", "step": 1716, "epoch": 2 }, { "type": "loss", "content": 0.17519257962703705, "timestamp": "2025-09-02 14:15:32.745732", "step": 1717, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:32.799592", "step": 1717, "epoch": 2 }, { "type": "loss", "content": 0.09706693142652512, "timestamp": "2025-09-02 14:15:32.801886", "step": 1718, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:32.855849", "step": 1718, "epoch": 2 }, { "type": "loss", "content": 0.17984811961650848, "timestamp": "2025-09-02 14:15:32.858394", "step": 1719, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:32.911461", "step": 1719, "epoch": 2 }, { "type": "loss", "content": 0.2735134959220886, "timestamp": "2025-09-02 14:15:32.917841", "step": 1720, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:15:34.783089", "step": 1720, "epoch": 2 }, { "type": "pplx", "content": 7528.436307194737, "timestamp": "2025-09-02 14:15:34.785723", "step": 1720, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1720", "timestamp": "2025-09-02 14:15:35.144585", "step": 1720, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:35.201784", "step": 1720, "epoch": 2 }, { "type": "loss", "content": 0.12885893881320953, "timestamp": "2025-09-02 14:15:35.203699", "step": 1721, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:35.257298", "step": 1721, "epoch": 2 }, { "type": "loss", "content": 0.23672454059123993, "timestamp": "2025-09-02 14:15:35.259755", "step": 1722, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:35.313807", "step": 1722, "epoch": 2 }, { "type": "loss", "content": 0.13505034148693085, "timestamp": "2025-09-02 14:15:35.316410", "step": 1723, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:35.369577", "step": 1723, "epoch": 2 }, { "type": "loss", "content": 0.199506476521492, "timestamp": "2025-09-02 14:15:35.375843", "step": 1724, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:35.428785", "step": 1724, "epoch": 2 }, { "type": "loss", "content": 0.15731976926326752, "timestamp": "2025-09-02 14:15:35.431106", "step": 1725, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:35.483943", "step": 1725, "epoch": 2 }, { "type": "loss", "content": 0.2577368915081024, "timestamp": "2025-09-02 14:15:35.486175", "step": 1726, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:35.539195", "step": 1726, "epoch": 2 }, { "type": "loss", "content": 0.18751418590545654, "timestamp": "2025-09-02 14:15:35.542222", "step": 1727, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:35.595622", "step": 1727, "epoch": 2 }, { "type": "loss", "content": 0.21542079746723175, "timestamp": "2025-09-02 14:15:35.601861", "step": 1728, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:35.654427", "step": 1728, "epoch": 2 }, { "type": "loss", "content": 0.08581150323152542, "timestamp": "2025-09-02 14:15:35.656583", "step": 1729, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:35.710100", "step": 1729, "epoch": 2 }, { "type": "loss", "content": 0.22542671859264374, "timestamp": "2025-09-02 14:15:35.712336", "step": 1730, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:35.766026", "step": 1730, "epoch": 2 }, { "type": "loss", "content": 0.26255002617836, "timestamp": "2025-09-02 14:15:35.768281", "step": 1731, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:35.820814", "step": 1731, "epoch": 2 }, { "type": "loss", "content": 0.16237226128578186, "timestamp": "2025-09-02 14:15:35.826954", "step": 1732, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:35.879593", "step": 1732, "epoch": 2 }, { "type": "loss", "content": 0.17486095428466797, "timestamp": "2025-09-02 14:15:35.881751", "step": 1733, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:35.934611", "step": 1733, "epoch": 2 }, { "type": "loss", "content": 0.12100626528263092, "timestamp": "2025-09-02 14:15:35.937113", "step": 1734, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:35.990377", "step": 1734, "epoch": 2 }, { "type": "loss", "content": 0.17906689643859863, "timestamp": "2025-09-02 14:15:35.992692", "step": 1735, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:36.046994", "step": 1735, "epoch": 2 }, { "type": "loss", "content": 0.1971115916967392, "timestamp": "2025-09-02 14:15:36.053911", "step": 1736, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:36.107031", "step": 1736, "epoch": 2 }, { "type": "loss", "content": 0.23668843507766724, "timestamp": "2025-09-02 14:15:36.109916", "step": 1737, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:36.163539", "step": 1737, "epoch": 2 }, { "type": "loss", "content": 0.07848786562681198, "timestamp": "2025-09-02 14:15:36.165940", "step": 1738, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:36.219060", "step": 1738, "epoch": 2 }, { "type": "loss", "content": 0.13556626439094543, "timestamp": "2025-09-02 14:15:36.221465", "step": 1739, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:36.274494", "step": 1739, "epoch": 2 }, { "type": "loss", "content": 0.09540779143571854, "timestamp": "2025-09-02 14:15:36.280100", "step": 1740, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:15:38.160458", "step": 1740, "epoch": 2 }, { "type": "pplx", "content": 7633.244061691635, "timestamp": "2025-09-02 14:15:38.162585", "step": 1740, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:38.215013", "step": 1740, "epoch": 2 }, { "type": "loss", "content": 0.29300329089164734, "timestamp": "2025-09-02 14:15:38.217165", "step": 1741, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:38.271058", "step": 1741, "epoch": 2 }, { "type": "loss", "content": 0.26189836859703064, "timestamp": "2025-09-02 14:15:38.273284", "step": 1742, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:38.327100", "step": 1742, "epoch": 2 }, { "type": "loss", "content": 0.17323113977909088, "timestamp": "2025-09-02 14:15:38.329425", "step": 1743, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:38.382893", "step": 1743, "epoch": 2 }, { "type": "loss", "content": 0.1450425684452057, "timestamp": "2025-09-02 14:15:38.389802", "step": 1744, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:38.442406", "step": 1744, "epoch": 2 }, { "type": "loss", "content": 0.12607303261756897, "timestamp": "2025-09-02 14:15:38.444706", "step": 1745, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:38.497693", "step": 1745, "epoch": 2 }, { "type": "loss", "content": 0.18408121168613434, "timestamp": "2025-09-02 14:15:38.500172", "step": 1746, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:38.553611", "step": 1746, "epoch": 2 }, { "type": "loss", "content": 0.1688344031572342, "timestamp": "2025-09-02 14:15:38.560953", "step": 1747, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:38.624571", "step": 1747, "epoch": 2 }, { "type": "loss", "content": 0.17954415082931519, "timestamp": "2025-09-02 14:15:38.630566", "step": 1748, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:38.682504", "step": 1748, "epoch": 2 }, { "type": "loss", "content": 0.2875926196575165, "timestamp": "2025-09-02 14:15:38.684715", "step": 1749, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:38.737253", "step": 1749, "epoch": 2 }, { "type": "loss", "content": 0.2102905958890915, "timestamp": "2025-09-02 14:15:38.739444", "step": 1750, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:38.792872", "step": 1750, "epoch": 2 }, { "type": "loss", "content": 0.24810941517353058, "timestamp": "2025-09-02 14:15:38.796948", "step": 1751, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:38.850728", "step": 1751, "epoch": 2 }, { "type": "loss", "content": 0.13266266882419586, "timestamp": "2025-09-02 14:15:38.856558", "step": 1752, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-02 14:15:38.908814", "step": 1752, "epoch": 2 }, { "type": "loss", "content": 0.15738427639007568, "timestamp": "2025-09-02 14:15:38.912000", "step": 1753, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:38.965570", "step": 1753, "epoch": 2 }, { "type": "loss", "content": 0.16937071084976196, "timestamp": "2025-09-02 14:15:38.967899", "step": 1754, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:39.020558", "step": 1754, "epoch": 2 }, { "type": "loss", "content": 0.2772127687931061, "timestamp": "2025-09-02 14:15:39.022987", "step": 1755, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:39.075788", "step": 1755, "epoch": 2 }, { "type": "loss", "content": 0.5170446634292603, "timestamp": "2025-09-02 14:15:39.081392", "step": 1756, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:39.136757", "step": 1756, "epoch": 2 }, { "type": "loss", "content": 0.22127649188041687, "timestamp": "2025-09-02 14:15:39.139024", "step": 1757, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:39.191834", "step": 1757, "epoch": 2 }, { "type": "loss", "content": 0.08185163885354996, "timestamp": "2025-09-02 14:15:39.194276", "step": 1758, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:39.247318", "step": 1758, "epoch": 2 }, { "type": "loss", "content": 0.1635594516992569, "timestamp": "2025-09-02 14:15:39.249668", "step": 1759, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:39.302601", "step": 1759, "epoch": 2 }, { "type": "loss", "content": 0.15450145304203033, "timestamp": "2025-09-02 14:15:39.308314", "step": 1760, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:15:41.163195", "step": 1760, "epoch": 2 }, { "type": "pplx", "content": 7753.9752662859655, "timestamp": "2025-09-02 14:15:41.165406", "step": 1760, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1760", "timestamp": "2025-09-02 14:15:41.524167", "step": 1760, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:41.582722", "step": 1760, "epoch": 2 }, { "type": "loss", "content": 0.11425971984863281, "timestamp": "2025-09-02 14:15:41.585249", "step": 1761, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:41.639740", "step": 1761, "epoch": 2 }, { "type": "loss", "content": 0.10684569180011749, "timestamp": "2025-09-02 14:15:41.642362", "step": 1762, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:41.696331", "step": 1762, "epoch": 2 }, { "type": "loss", "content": 0.28724542260169983, "timestamp": "2025-09-02 14:15:41.698538", "step": 1763, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:41.751641", "step": 1763, "epoch": 2 }, { "type": "loss", "content": 0.08131976425647736, "timestamp": "2025-09-02 14:15:41.757830", "step": 1764, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:41.809927", "step": 1764, "epoch": 2 }, { "type": "loss", "content": 0.1067056655883789, "timestamp": "2025-09-02 14:15:41.812196", "step": 1765, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:41.865545", "step": 1765, "epoch": 2 }, { "type": "loss", "content": 0.09689836949110031, "timestamp": "2025-09-02 14:15:41.867948", "step": 1766, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:41.920567", "step": 1766, "epoch": 2 }, { "type": "loss", "content": 0.3055119216442108, "timestamp": "2025-09-02 14:15:41.924223", "step": 1767, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:41.977118", "step": 1767, "epoch": 2 }, { "type": "loss", "content": 0.1067226380109787, "timestamp": "2025-09-02 14:15:41.983110", "step": 1768, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:42.035350", "step": 1768, "epoch": 2 }, { "type": "loss", "content": 0.14087556302547455, "timestamp": "2025-09-02 14:15:42.037702", "step": 1769, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:42.090082", "step": 1769, "epoch": 2 }, { "type": "loss", "content": 0.15882445871829987, "timestamp": "2025-09-02 14:15:42.092466", "step": 1770, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:42.145333", "step": 1770, "epoch": 2 }, { "type": "loss", "content": 0.2554975748062134, "timestamp": "2025-09-02 14:15:42.147660", "step": 1771, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:42.200498", "step": 1771, "epoch": 2 }, { "type": "loss", "content": 0.11007653921842575, "timestamp": "2025-09-02 14:15:42.206723", "step": 1772, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:42.259807", "step": 1772, "epoch": 2 }, { "type": "loss", "content": 0.12223494052886963, "timestamp": "2025-09-02 14:15:42.262145", "step": 1773, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:42.315164", "step": 1773, "epoch": 2 }, { "type": "loss", "content": 0.13094735145568848, "timestamp": "2025-09-02 14:15:42.317524", "step": 1774, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:42.370586", "step": 1774, "epoch": 2 }, { "type": "loss", "content": 0.05951283127069473, "timestamp": "2025-09-02 14:15:42.373188", "step": 1775, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:42.426864", "step": 1775, "epoch": 2 }, { "type": "loss", "content": 0.29583948850631714, "timestamp": "2025-09-02 14:15:42.433880", "step": 1776, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:42.486567", "step": 1776, "epoch": 2 }, { "type": "loss", "content": 0.08479168266057968, "timestamp": "2025-09-02 14:15:42.488818", "step": 1777, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:42.541804", "step": 1777, "epoch": 2 }, { "type": "loss", "content": 0.26028475165367126, "timestamp": "2025-09-02 14:15:42.544160", "step": 1778, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:42.597603", "step": 1778, "epoch": 2 }, { "type": "loss", "content": 0.15011443197727203, "timestamp": "2025-09-02 14:15:42.600117", "step": 1779, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:42.653235", "step": 1779, "epoch": 2 }, { "type": "loss", "content": 0.14348213374614716, "timestamp": "2025-09-02 14:15:42.659293", "step": 1780, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:15:44.524599", "step": 1780, "epoch": 2 }, { "type": "pplx", "content": 7706.735922764698, "timestamp": "2025-09-02 14:15:44.526961", "step": 1780, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:44.579916", "step": 1780, "epoch": 2 }, { "type": "loss", "content": 0.1557232290506363, "timestamp": "2025-09-02 14:15:44.582208", "step": 1781, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:44.635480", "step": 1781, "epoch": 2 }, { "type": "loss", "content": 0.21525073051452637, "timestamp": "2025-09-02 14:15:44.637826", "step": 1782, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:44.690870", "step": 1782, "epoch": 2 }, { "type": "loss", "content": 0.10086548328399658, "timestamp": "2025-09-02 14:15:44.692900", "step": 1783, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:44.745733", "step": 1783, "epoch": 2 }, { "type": "loss", "content": 0.11167874187231064, "timestamp": "2025-09-02 14:15:44.751905", "step": 1784, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:44.806231", "step": 1784, "epoch": 2 }, { "type": "loss", "content": 0.17378968000411987, "timestamp": "2025-09-02 14:15:44.812490", "step": 1785, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:44.872401", "step": 1785, "epoch": 2 }, { "type": "loss", "content": 0.22591815888881683, "timestamp": "2025-09-02 14:15:44.874925", "step": 1786, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:44.928248", "step": 1786, "epoch": 2 }, { "type": "loss", "content": 0.2760980427265167, "timestamp": "2025-09-02 14:15:44.930831", "step": 1787, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:44.984498", "step": 1787, "epoch": 2 }, { "type": "loss", "content": 0.27463850378990173, "timestamp": "2025-09-02 14:15:44.990664", "step": 1788, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:45.042730", "step": 1788, "epoch": 2 }, { "type": "loss", "content": 0.14035804569721222, "timestamp": "2025-09-02 14:15:45.045425", "step": 1789, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:45.098488", "step": 1789, "epoch": 2 }, { "type": "loss", "content": 0.10468107461929321, "timestamp": "2025-09-02 14:15:45.100625", "step": 1790, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:45.153399", "step": 1790, "epoch": 2 }, { "type": "loss", "content": 0.1720709204673767, "timestamp": "2025-09-02 14:15:45.156049", "step": 1791, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:45.210366", "step": 1791, "epoch": 2 }, { "type": "loss", "content": 0.23642684519290924, "timestamp": "2025-09-02 14:15:45.216828", "step": 1792, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:45.269081", "step": 1792, "epoch": 2 }, { "type": "loss", "content": 0.15846972167491913, "timestamp": "2025-09-02 14:15:45.271544", "step": 1793, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:45.324040", "step": 1793, "epoch": 2 }, { "type": "loss", "content": 0.11340558528900146, "timestamp": "2025-09-02 14:15:45.326495", "step": 1794, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:45.379277", "step": 1794, "epoch": 2 }, { "type": "loss", "content": 0.21715334057807922, "timestamp": "2025-09-02 14:15:45.381712", "step": 1795, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:45.434543", "step": 1795, "epoch": 2 }, { "type": "loss", "content": 0.16832461953163147, "timestamp": "2025-09-02 14:15:45.440558", "step": 1796, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:45.492909", "step": 1796, "epoch": 2 }, { "type": "loss", "content": 0.19561609625816345, "timestamp": "2025-09-02 14:15:45.494931", "step": 1797, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:45.547635", "step": 1797, "epoch": 2 }, { "type": "loss", "content": 0.21970973908901215, "timestamp": "2025-09-02 14:15:45.549790", "step": 1798, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:45.602291", "step": 1798, "epoch": 2 }, { "type": "loss", "content": 0.14352989196777344, "timestamp": "2025-09-02 14:15:45.604636", "step": 1799, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:45.657238", "step": 1799, "epoch": 2 }, { "type": "loss", "content": 0.22365279495716095, "timestamp": "2025-09-02 14:15:45.663091", "step": 1800, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:15:47.520868", "step": 1800, "epoch": 2 }, { "type": "pplx", "content": 7817.467518401361, "timestamp": "2025-09-02 14:15:47.523553", "step": 1800, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1800", "timestamp": "2025-09-02 14:15:47.894753", "step": 1800, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:47.950779", "step": 1800, "epoch": 2 }, { "type": "loss", "content": 0.17146605253219604, "timestamp": "2025-09-02 14:15:47.953088", "step": 1801, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:48.011825", "step": 1801, "epoch": 2 }, { "type": "loss", "content": 0.35452309250831604, "timestamp": "2025-09-02 14:15:48.014112", "step": 1802, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:48.067122", "step": 1802, "epoch": 2 }, { "type": "loss", "content": 0.24935168027877808, "timestamp": "2025-09-02 14:15:48.069183", "step": 1803, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:48.121953", "step": 1803, "epoch": 2 }, { "type": "loss", "content": 0.14707744121551514, "timestamp": "2025-09-02 14:15:48.128669", "step": 1804, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:48.181305", "step": 1804, "epoch": 2 }, { "type": "loss", "content": 0.22790847718715668, "timestamp": "2025-09-02 14:15:48.183840", "step": 1805, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:48.236972", "step": 1805, "epoch": 2 }, { "type": "loss", "content": 0.23189443349838257, "timestamp": "2025-09-02 14:15:48.239229", "step": 1806, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:48.292687", "step": 1806, "epoch": 2 }, { "type": "loss", "content": 0.15542644262313843, "timestamp": "2025-09-02 14:15:48.295167", "step": 1807, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:48.349600", "step": 1807, "epoch": 2 }, { "type": "loss", "content": 0.40288785099983215, "timestamp": "2025-09-02 14:15:48.355788", "step": 1808, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:48.408404", "step": 1808, "epoch": 2 }, { "type": "loss", "content": 0.3291488289833069, "timestamp": "2025-09-02 14:15:48.410553", "step": 1809, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:48.464372", "step": 1809, "epoch": 2 }, { "type": "loss", "content": 0.25502434372901917, "timestamp": "2025-09-02 14:15:48.466865", "step": 1810, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:48.520296", "step": 1810, "epoch": 2 }, { "type": "loss", "content": 0.23031941056251526, "timestamp": "2025-09-02 14:15:48.522609", "step": 1811, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:48.575531", "step": 1811, "epoch": 2 }, { "type": "loss", "content": 0.14826345443725586, "timestamp": "2025-09-02 14:15:48.581641", "step": 1812, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:48.634537", "step": 1812, "epoch": 2 }, { "type": "loss", "content": 0.15266039967536926, "timestamp": "2025-09-02 14:15:48.636871", "step": 1813, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:48.689430", "step": 1813, "epoch": 2 }, { "type": "loss", "content": 0.21934659779071808, "timestamp": "2025-09-02 14:15:48.691875", "step": 1814, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:48.745211", "step": 1814, "epoch": 2 }, { "type": "loss", "content": 0.21679066121578217, "timestamp": "2025-09-02 14:15:48.747649", "step": 1815, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:48.800600", "step": 1815, "epoch": 2 }, { "type": "loss", "content": 0.10107144713401794, "timestamp": "2025-09-02 14:15:48.806793", "step": 1816, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:48.860695", "step": 1816, "epoch": 2 }, { "type": "loss", "content": 0.2022203654050827, "timestamp": "2025-09-02 14:15:48.862956", "step": 1817, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:48.915558", "step": 1817, "epoch": 2 }, { "type": "loss", "content": 0.24269387125968933, "timestamp": "2025-09-02 14:15:48.917848", "step": 1818, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:48.970805", "step": 1818, "epoch": 2 }, { "type": "loss", "content": 0.13179461658000946, "timestamp": "2025-09-02 14:15:48.973122", "step": 1819, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:49.026497", "step": 1819, "epoch": 2 }, { "type": "loss", "content": 0.1932450532913208, "timestamp": "2025-09-02 14:15:49.032462", "step": 1820, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:15:50.893051", "step": 1820, "epoch": 2 }, { "type": "pplx", "content": 7859.1842252044235, "timestamp": "2025-09-02 14:15:50.895537", "step": 1820, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:50.947434", "step": 1820, "epoch": 2 }, { "type": "loss", "content": 0.2009473741054535, "timestamp": "2025-09-02 14:15:50.949711", "step": 1821, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:51.003152", "step": 1821, "epoch": 2 }, { "type": "loss", "content": 0.22282087802886963, "timestamp": "2025-09-02 14:15:51.005354", "step": 1822, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:51.058183", "step": 1822, "epoch": 2 }, { "type": "loss", "content": 0.13276855647563934, "timestamp": "2025-09-02 14:15:51.061337", "step": 1823, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:51.114030", "step": 1823, "epoch": 2 }, { "type": "loss", "content": 0.12276124209165573, "timestamp": "2025-09-02 14:15:51.120269", "step": 1824, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:51.172679", "step": 1824, "epoch": 2 }, { "type": "loss", "content": 0.1798618733882904, "timestamp": "2025-09-02 14:15:51.175341", "step": 1825, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:51.228546", "step": 1825, "epoch": 2 }, { "type": "loss", "content": 0.2601500451564789, "timestamp": "2025-09-02 14:15:51.231003", "step": 1826, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:51.284056", "step": 1826, "epoch": 2 }, { "type": "loss", "content": 0.30115386843681335, "timestamp": "2025-09-02 14:15:51.286505", "step": 1827, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:51.339335", "step": 1827, "epoch": 2 }, { "type": "loss", "content": 0.19343294203281403, "timestamp": "2025-09-02 14:15:51.345846", "step": 1828, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:51.400834", "step": 1828, "epoch": 2 }, { "type": "loss", "content": 0.12748147547245026, "timestamp": "2025-09-02 14:15:51.403151", "step": 1829, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:51.456287", "step": 1829, "epoch": 2 }, { "type": "loss", "content": 0.1342405080795288, "timestamp": "2025-09-02 14:15:51.458812", "step": 1830, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:51.511891", "step": 1830, "epoch": 2 }, { "type": "loss", "content": 0.190438911318779, "timestamp": "2025-09-02 14:15:51.513999", "step": 1831, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:51.567068", "step": 1831, "epoch": 2 }, { "type": "loss", "content": 0.2173653095960617, "timestamp": "2025-09-02 14:15:51.573202", "step": 1832, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:51.625638", "step": 1832, "epoch": 2 }, { "type": "loss", "content": 0.23265549540519714, "timestamp": "2025-09-02 14:15:51.628729", "step": 1833, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:51.682317", "step": 1833, "epoch": 2 }, { "type": "loss", "content": 0.09783462435007095, "timestamp": "2025-09-02 14:15:51.684604", "step": 1834, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:51.737510", "step": 1834, "epoch": 2 }, { "type": "loss", "content": 0.08890616148710251, "timestamp": "2025-09-02 14:15:51.740188", "step": 1835, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:51.794499", "step": 1835, "epoch": 2 }, { "type": "loss", "content": 0.18724150955677032, "timestamp": "2025-09-02 14:15:51.800576", "step": 1836, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:51.853209", "step": 1836, "epoch": 2 }, { "type": "loss", "content": 0.25752145051956177, "timestamp": "2025-09-02 14:15:51.856296", "step": 1837, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:51.909591", "step": 1837, "epoch": 2 }, { "type": "loss", "content": 0.13435253500938416, "timestamp": "2025-09-02 14:15:51.912191", "step": 1838, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:51.966429", "step": 1838, "epoch": 2 }, { "type": "loss", "content": 0.23838406801223755, "timestamp": "2025-09-02 14:15:51.968866", "step": 1839, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:52.021556", "step": 1839, "epoch": 2 }, { "type": "loss", "content": 0.13995742797851562, "timestamp": "2025-09-02 14:15:52.028099", "step": 1840, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:15:53.899817", "step": 1840, "epoch": 2 }, { "type": "pplx", "content": 7958.803318342594, "timestamp": "2025-09-02 14:15:53.902380", "step": 1840, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1840", "timestamp": "2025-09-02 14:15:54.266262", "step": 1840, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:54.323113", "step": 1840, "epoch": 2 }, { "type": "loss", "content": 0.22761762142181396, "timestamp": "2025-09-02 14:15:54.326004", "step": 1841, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:54.380684", "step": 1841, "epoch": 2 }, { "type": "loss", "content": 0.26244452595710754, "timestamp": "2025-09-02 14:15:54.383198", "step": 1842, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:54.437923", "step": 1842, "epoch": 2 }, { "type": "loss", "content": 0.174306720495224, "timestamp": "2025-09-02 14:15:54.440199", "step": 1843, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:54.493151", "step": 1843, "epoch": 2 }, { "type": "loss", "content": 0.18067032098770142, "timestamp": "2025-09-02 14:15:54.499876", "step": 1844, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:54.552208", "step": 1844, "epoch": 2 }, { "type": "loss", "content": 0.1359114795923233, "timestamp": "2025-09-02 14:15:54.554868", "step": 1845, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:54.607440", "step": 1845, "epoch": 2 }, { "type": "loss", "content": 0.18539489805698395, "timestamp": "2025-09-02 14:15:54.609709", "step": 1846, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:54.662753", "step": 1846, "epoch": 2 }, { "type": "loss", "content": 0.21104641258716583, "timestamp": "2025-09-02 14:15:54.664969", "step": 1847, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:54.717813", "step": 1847, "epoch": 2 }, { "type": "loss", "content": 0.16156761348247528, "timestamp": "2025-09-02 14:15:54.723953", "step": 1848, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:54.776357", "step": 1848, "epoch": 2 }, { "type": "loss", "content": 0.11158127337694168, "timestamp": "2025-09-02 14:15:54.778869", "step": 1849, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:54.832778", "step": 1849, "epoch": 2 }, { "type": "loss", "content": 0.1222737729549408, "timestamp": "2025-09-02 14:15:54.835104", "step": 1850, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:54.888529", "step": 1850, "epoch": 2 }, { "type": "loss", "content": 0.18295614421367645, "timestamp": "2025-09-02 14:15:54.890805", "step": 1851, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:54.943888", "step": 1851, "epoch": 2 }, { "type": "loss", "content": 0.17823970317840576, "timestamp": "2025-09-02 14:15:54.950570", "step": 1852, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:55.002839", "step": 1852, "epoch": 2 }, { "type": "loss", "content": 0.12915699183940887, "timestamp": "2025-09-02 14:15:55.005245", "step": 1853, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:55.058122", "step": 1853, "epoch": 2 }, { "type": "loss", "content": 0.1913820058107376, "timestamp": "2025-09-02 14:15:55.060548", "step": 1854, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:55.113604", "step": 1854, "epoch": 2 }, { "type": "loss", "content": 0.20231930911540985, "timestamp": "2025-09-02 14:15:55.116004", "step": 1855, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:55.168758", "step": 1855, "epoch": 2 }, { "type": "loss", "content": 0.17431704699993134, "timestamp": "2025-09-02 14:15:55.174804", "step": 1856, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:55.226592", "step": 1856, "epoch": 2 }, { "type": "loss", "content": 0.14133046567440033, "timestamp": "2025-09-02 14:15:55.228951", "step": 1857, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:55.281973", "step": 1857, "epoch": 2 }, { "type": "loss", "content": 0.09215334802865982, "timestamp": "2025-09-02 14:15:55.284252", "step": 1858, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:55.338419", "step": 1858, "epoch": 2 }, { "type": "loss", "content": 0.11951219290494919, "timestamp": "2025-09-02 14:15:55.340907", "step": 1859, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:55.393684", "step": 1859, "epoch": 2 }, { "type": "loss", "content": 0.25280049443244934, "timestamp": "2025-09-02 14:15:55.399630", "step": 1860, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:15:57.266853", "step": 1860, "epoch": 2 }, { "type": "pplx", "content": 8051.074620293211, "timestamp": "2025-09-02 14:15:57.269194", "step": 1860, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:57.323171", "step": 1860, "epoch": 2 }, { "type": "loss", "content": 0.3373721241950989, "timestamp": "2025-09-02 14:15:57.325775", "step": 1861, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:57.380269", "step": 1861, "epoch": 2 }, { "type": "loss", "content": 0.16688603162765503, "timestamp": "2025-09-02 14:15:57.382610", "step": 1862, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:57.435481", "step": 1862, "epoch": 2 }, { "type": "loss", "content": 0.3392629623413086, "timestamp": "2025-09-02 14:15:57.437888", "step": 1863, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:57.491224", "step": 1863, "epoch": 2 }, { "type": "loss", "content": 0.1821957379579544, "timestamp": "2025-09-02 14:15:57.497444", "step": 1864, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:57.549649", "step": 1864, "epoch": 2 }, { "type": "loss", "content": 0.1835537701845169, "timestamp": "2025-09-02 14:15:57.551891", "step": 1865, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:57.604255", "step": 1865, "epoch": 2 }, { "type": "loss", "content": 0.19573135673999786, "timestamp": "2025-09-02 14:15:57.606601", "step": 1866, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:57.659553", "step": 1866, "epoch": 2 }, { "type": "loss", "content": 0.06953948736190796, "timestamp": "2025-09-02 14:15:57.661892", "step": 1867, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:57.714396", "step": 1867, "epoch": 2 }, { "type": "loss", "content": 0.11267454922199249, "timestamp": "2025-09-02 14:15:57.720391", "step": 1868, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:57.773967", "step": 1868, "epoch": 2 }, { "type": "loss", "content": 0.30883121490478516, "timestamp": "2025-09-02 14:15:57.776274", "step": 1869, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:57.829189", "step": 1869, "epoch": 2 }, { "type": "loss", "content": 0.17797790467739105, "timestamp": "2025-09-02 14:15:57.831904", "step": 1870, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:57.884833", "step": 1870, "epoch": 2 }, { "type": "loss", "content": 0.275183767080307, "timestamp": "2025-09-02 14:15:57.887170", "step": 1871, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:57.941115", "step": 1871, "epoch": 2 }, { "type": "loss", "content": 0.20413023233413696, "timestamp": "2025-09-02 14:15:57.947085", "step": 1872, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:57.998832", "step": 1872, "epoch": 2 }, { "type": "loss", "content": 0.15921097993850708, "timestamp": "2025-09-02 14:15:58.001084", "step": 1873, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:58.053445", "step": 1873, "epoch": 2 }, { "type": "loss", "content": 0.0773455873131752, "timestamp": "2025-09-02 14:15:58.055811", "step": 1874, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:58.108515", "step": 1874, "epoch": 2 }, { "type": "loss", "content": 0.2109753042459488, "timestamp": "2025-09-02 14:15:58.110917", "step": 1875, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:15:58.163877", "step": 1875, "epoch": 2 }, { "type": "loss", "content": 0.11343705654144287, "timestamp": "2025-09-02 14:15:58.169864", "step": 1876, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:58.221885", "step": 1876, "epoch": 2 }, { "type": "loss", "content": 0.2791711091995239, "timestamp": "2025-09-02 14:15:58.224346", "step": 1877, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:58.278284", "step": 1877, "epoch": 2 }, { "type": "loss", "content": 0.19599246978759766, "timestamp": "2025-09-02 14:15:58.280601", "step": 1878, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:15:58.333268", "step": 1878, "epoch": 2 }, { "type": "loss", "content": 0.12952861189842224, "timestamp": "2025-09-02 14:15:58.335501", "step": 1879, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:15:58.388478", "step": 1879, "epoch": 2 }, { "type": "loss", "content": 0.38494375348091125, "timestamp": "2025-09-02 14:15:58.394486", "step": 1880, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:16:00.256525", "step": 1880, "epoch": 2 }, { "type": "pplx", "content": 8163.74852442723, "timestamp": "2025-09-02 14:16:00.258994", "step": 1880, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1880", "timestamp": "2025-09-02 14:16:00.615363", "step": 1880, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:00.673168", "step": 1880, "epoch": 2 }, { "type": "loss", "content": 0.18038193881511688, "timestamp": "2025-09-02 14:16:00.675399", "step": 1881, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:00.730051", "step": 1881, "epoch": 2 }, { "type": "loss", "content": 0.1823219209909439, "timestamp": "2025-09-02 14:16:00.733509", "step": 1882, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:00.786813", "step": 1882, "epoch": 2 }, { "type": "loss", "content": 0.24104660749435425, "timestamp": "2025-09-02 14:16:00.789025", "step": 1883, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:00.842336", "step": 1883, "epoch": 2 }, { "type": "loss", "content": 0.21124954521656036, "timestamp": "2025-09-02 14:16:00.848630", "step": 1884, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:00.902432", "step": 1884, "epoch": 2 }, { "type": "loss", "content": 0.15783198177814484, "timestamp": "2025-09-02 14:16:00.904853", "step": 1885, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:00.958078", "step": 1885, "epoch": 2 }, { "type": "loss", "content": 0.13810290396213531, "timestamp": "2025-09-02 14:16:00.960938", "step": 1886, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:01.015660", "step": 1886, "epoch": 2 }, { "type": "loss", "content": 0.17798461019992828, "timestamp": "2025-09-02 14:16:01.018101", "step": 1887, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:01.070736", "step": 1887, "epoch": 2 }, { "type": "loss", "content": 0.14229047298431396, "timestamp": "2025-09-02 14:16:01.076711", "step": 1888, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:01.128792", "step": 1888, "epoch": 2 }, { "type": "loss", "content": 0.37958210706710815, "timestamp": "2025-09-02 14:16:01.131074", "step": 1889, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:01.184054", "step": 1889, "epoch": 2 }, { "type": "loss", "content": 0.2152203470468521, "timestamp": "2025-09-02 14:16:01.186603", "step": 1890, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:01.239646", "step": 1890, "epoch": 2 }, { "type": "loss", "content": 0.09206999093294144, "timestamp": "2025-09-02 14:16:01.241985", "step": 1891, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:01.295845", "step": 1891, "epoch": 2 }, { "type": "loss", "content": 0.12106619775295258, "timestamp": "2025-09-02 14:16:01.302142", "step": 1892, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:01.355666", "step": 1892, "epoch": 2 }, { "type": "loss", "content": 0.23240812122821808, "timestamp": "2025-09-02 14:16:01.357848", "step": 1893, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:01.414405", "step": 1893, "epoch": 2 }, { "type": "loss", "content": 0.1828063428401947, "timestamp": "2025-09-02 14:16:01.416927", "step": 1894, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:01.478106", "step": 1894, "epoch": 2 }, { "type": "loss", "content": 0.3519268333911896, "timestamp": "2025-09-02 14:16:01.480756", "step": 1895, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:01.534814", "step": 1895, "epoch": 2 }, { "type": "loss", "content": 0.22637444734573364, "timestamp": "2025-09-02 14:16:01.540784", "step": 1896, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:01.593303", "step": 1896, "epoch": 2 }, { "type": "loss", "content": 0.14882028102874756, "timestamp": "2025-09-02 14:16:01.595858", "step": 1897, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:01.649772", "step": 1897, "epoch": 2 }, { "type": "loss", "content": 0.12092243880033493, "timestamp": "2025-09-02 14:16:01.652067", "step": 1898, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:01.705188", "step": 1898, "epoch": 2 }, { "type": "loss", "content": 0.11533931642770767, "timestamp": "2025-09-02 14:16:01.707778", "step": 1899, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:01.760684", "step": 1899, "epoch": 2 }, { "type": "loss", "content": 0.2873263359069824, "timestamp": "2025-09-02 14:16:01.766688", "step": 1900, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:16:03.635883", "step": 1900, "epoch": 2 }, { "type": "pplx", "content": 8106.365164010821, "timestamp": "2025-09-02 14:16:03.638361", "step": 1900, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:03.691905", "step": 1900, "epoch": 2 }, { "type": "loss", "content": 0.15828180313110352, "timestamp": "2025-09-02 14:16:03.694223", "step": 1901, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:03.748810", "step": 1901, "epoch": 2 }, { "type": "loss", "content": 0.16440804302692413, "timestamp": "2025-09-02 14:16:03.751080", "step": 1902, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:03.804856", "step": 1902, "epoch": 2 }, { "type": "loss", "content": 0.09268023818731308, "timestamp": "2025-09-02 14:16:03.807532", "step": 1903, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:03.860391", "step": 1903, "epoch": 2 }, { "type": "loss", "content": 0.12378431111574173, "timestamp": "2025-09-02 14:16:03.866653", "step": 1904, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:03.918493", "step": 1904, "epoch": 2 }, { "type": "loss", "content": 0.18853148818016052, "timestamp": "2025-09-02 14:16:03.921080", "step": 1905, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:03.980418", "step": 1905, "epoch": 2 }, { "type": "loss", "content": 0.3089987635612488, "timestamp": "2025-09-02 14:16:03.982743", "step": 1906, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:04.036163", "step": 1906, "epoch": 2 }, { "type": "loss", "content": 0.1494036465883255, "timestamp": "2025-09-02 14:16:04.038419", "step": 1907, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:04.092090", "step": 1907, "epoch": 2 }, { "type": "loss", "content": 0.14358727633953094, "timestamp": "2025-09-02 14:16:04.098188", "step": 1908, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:04.150539", "step": 1908, "epoch": 2 }, { "type": "loss", "content": 0.12061045318841934, "timestamp": "2025-09-02 14:16:04.153262", "step": 1909, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:04.206215", "step": 1909, "epoch": 2 }, { "type": "loss", "content": 0.17427130043506622, "timestamp": "2025-09-02 14:16:04.208431", "step": 1910, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:04.261861", "step": 1910, "epoch": 2 }, { "type": "loss", "content": 0.10978276282548904, "timestamp": "2025-09-02 14:16:04.263961", "step": 1911, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:04.316770", "step": 1911, "epoch": 2 }, { "type": "loss", "content": 0.15086019039154053, "timestamp": "2025-09-02 14:16:04.322781", "step": 1912, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:04.376363", "step": 1912, "epoch": 2 }, { "type": "loss", "content": 0.08899417519569397, "timestamp": "2025-09-02 14:16:04.378729", "step": 1913, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:04.432036", "step": 1913, "epoch": 2 }, { "type": "loss", "content": 0.21305234730243683, "timestamp": "2025-09-02 14:16:04.434542", "step": 1914, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:04.487862", "step": 1914, "epoch": 2 }, { "type": "loss", "content": 0.24360013008117676, "timestamp": "2025-09-02 14:16:04.490272", "step": 1915, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:04.544059", "step": 1915, "epoch": 2 }, { "type": "loss", "content": 0.23946958780288696, "timestamp": "2025-09-02 14:16:04.550263", "step": 1916, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:04.603184", "step": 1916, "epoch": 2 }, { "type": "loss", "content": 0.12358441948890686, "timestamp": "2025-09-02 14:16:04.605730", "step": 1917, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:04.659032", "step": 1917, "epoch": 2 }, { "type": "loss", "content": 0.06910453736782074, "timestamp": "2025-09-02 14:16:04.662188", "step": 1918, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:04.715461", "step": 1918, "epoch": 2 }, { "type": "loss", "content": 0.12555745244026184, "timestamp": "2025-09-02 14:16:04.717537", "step": 1919, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:04.769812", "step": 1919, "epoch": 2 }, { "type": "loss", "content": 0.22887758910655975, "timestamp": "2025-09-02 14:16:04.775928", "step": 1920, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:16:06.675599", "step": 1920, "epoch": 2 }, { "type": "pplx", "content": 7990.2928293451605, "timestamp": "2025-09-02 14:16:06.677800", "step": 1920, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1920", "timestamp": "2025-09-02 14:16:07.046901", "step": 1920, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:07.102687", "step": 1920, "epoch": 2 }, { "type": "loss", "content": 0.16038541495800018, "timestamp": "2025-09-02 14:16:07.105005", "step": 1921, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:07.158817", "step": 1921, "epoch": 2 }, { "type": "loss", "content": 0.261304646730423, "timestamp": "2025-09-02 14:16:07.161155", "step": 1922, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:07.215042", "step": 1922, "epoch": 2 }, { "type": "loss", "content": 0.25237518548965454, "timestamp": "2025-09-02 14:16:07.217798", "step": 1923, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:07.270493", "step": 1923, "epoch": 2 }, { "type": "loss", "content": 0.12895306944847107, "timestamp": "2025-09-02 14:16:07.276737", "step": 1924, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:07.329441", "step": 1924, "epoch": 2 }, { "type": "loss", "content": 0.14134518802165985, "timestamp": "2025-09-02 14:16:07.331658", "step": 1925, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:07.384831", "step": 1925, "epoch": 2 }, { "type": "loss", "content": 0.3103735148906708, "timestamp": "2025-09-02 14:16:07.387313", "step": 1926, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:07.440245", "step": 1926, "epoch": 2 }, { "type": "loss", "content": 0.13866457343101501, "timestamp": "2025-09-02 14:16:07.442721", "step": 1927, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:07.497726", "step": 1927, "epoch": 2 }, { "type": "loss", "content": 0.03376142680644989, "timestamp": "2025-09-02 14:16:07.503610", "step": 1928, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:07.556923", "step": 1928, "epoch": 2 }, { "type": "loss", "content": 0.35878056287765503, "timestamp": "2025-09-02 14:16:07.559161", "step": 1929, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:07.612091", "step": 1929, "epoch": 2 }, { "type": "loss", "content": 0.1798470914363861, "timestamp": "2025-09-02 14:16:07.614717", "step": 1930, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:07.667926", "step": 1930, "epoch": 2 }, { "type": "loss", "content": 0.09826577454805374, "timestamp": "2025-09-02 14:16:07.670337", "step": 1931, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:07.723503", "step": 1931, "epoch": 2 }, { "type": "loss", "content": 0.16266728937625885, "timestamp": "2025-09-02 14:16:07.729690", "step": 1932, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:07.781918", "step": 1932, "epoch": 2 }, { "type": "loss", "content": 0.1496214121580124, "timestamp": "2025-09-02 14:16:07.784494", "step": 1933, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:07.837289", "step": 1933, "epoch": 2 }, { "type": "loss", "content": 0.23084792494773865, "timestamp": "2025-09-02 14:16:07.839929", "step": 1934, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:07.893139", "step": 1934, "epoch": 2 }, { "type": "loss", "content": 0.15959103405475616, "timestamp": "2025-09-02 14:16:07.896385", "step": 1935, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:07.949365", "step": 1935, "epoch": 2 }, { "type": "loss", "content": 0.13582269847393036, "timestamp": "2025-09-02 14:16:07.955502", "step": 1936, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:08.007939", "step": 1936, "epoch": 2 }, { "type": "loss", "content": 0.16800452768802643, "timestamp": "2025-09-02 14:16:08.010334", "step": 1937, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:08.063411", "step": 1937, "epoch": 2 }, { "type": "loss", "content": 0.2986774444580078, "timestamp": "2025-09-02 14:16:08.066290", "step": 1938, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:08.119497", "step": 1938, "epoch": 2 }, { "type": "loss", "content": 0.10262059420347214, "timestamp": "2025-09-02 14:16:08.121704", "step": 1939, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:08.174744", "step": 1939, "epoch": 2 }, { "type": "loss", "content": 0.14162565767765045, "timestamp": "2025-09-02 14:16:08.180632", "step": 1940, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:16:10.040126", "step": 1940, "epoch": 2 }, { "type": "pplx", "content": 7953.0890835082155, "timestamp": "2025-09-02 14:16:10.042695", "step": 1940, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:10.096040", "step": 1940, "epoch": 2 }, { "type": "loss", "content": 0.10172519087791443, "timestamp": "2025-09-02 14:16:10.098200", "step": 1941, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:10.152114", "step": 1941, "epoch": 2 }, { "type": "loss", "content": 0.1732034832239151, "timestamp": "2025-09-02 14:16:10.154377", "step": 1942, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:10.207873", "step": 1942, "epoch": 2 }, { "type": "loss", "content": 0.21895946562290192, "timestamp": "2025-09-02 14:16:10.210161", "step": 1943, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:10.263650", "step": 1943, "epoch": 2 }, { "type": "loss", "content": 0.1304192841053009, "timestamp": "2025-09-02 14:16:10.269620", "step": 1944, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:10.322144", "step": 1944, "epoch": 2 }, { "type": "loss", "content": 0.13473154604434967, "timestamp": "2025-09-02 14:16:10.324124", "step": 1945, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:10.377442", "step": 1945, "epoch": 2 }, { "type": "loss", "content": 0.16839873790740967, "timestamp": "2025-09-02 14:16:10.379440", "step": 1946, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:10.434292", "step": 1946, "epoch": 2 }, { "type": "loss", "content": 0.20952889323234558, "timestamp": "2025-09-02 14:16:10.436378", "step": 1947, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:10.489396", "step": 1947, "epoch": 2 }, { "type": "loss", "content": 0.11813486367464066, "timestamp": "2025-09-02 14:16:10.495321", "step": 1948, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:10.549271", "step": 1948, "epoch": 2 }, { "type": "loss", "content": 0.17881208658218384, "timestamp": "2025-09-02 14:16:10.551521", "step": 1949, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:10.604781", "step": 1949, "epoch": 2 }, { "type": "loss", "content": 0.23326382040977478, "timestamp": "2025-09-02 14:16:10.607214", "step": 1950, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:10.660417", "step": 1950, "epoch": 2 }, { "type": "loss", "content": 0.1910639852285385, "timestamp": "2025-09-02 14:16:10.662925", "step": 1951, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:10.716269", "step": 1951, "epoch": 2 }, { "type": "loss", "content": 0.30274420976638794, "timestamp": "2025-09-02 14:16:10.721647", "step": 1952, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:10.774117", "step": 1952, "epoch": 2 }, { "type": "loss", "content": 0.12615236639976501, "timestamp": "2025-09-02 14:16:10.775981", "step": 1953, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:10.830421", "step": 1953, "epoch": 2 }, { "type": "loss", "content": 0.1378999799489975, "timestamp": "2025-09-02 14:16:10.832545", "step": 1954, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:10.885492", "step": 1954, "epoch": 2 }, { "type": "loss", "content": 0.34808650612831116, "timestamp": "2025-09-02 14:16:10.888965", "step": 1955, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:10.942238", "step": 1955, "epoch": 2 }, { "type": "loss", "content": 0.19415098428726196, "timestamp": "2025-09-02 14:16:10.948266", "step": 1956, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:11.001151", "step": 1956, "epoch": 2 }, { "type": "loss", "content": 0.15082013607025146, "timestamp": "2025-09-02 14:16:11.003335", "step": 1957, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:11.056071", "step": 1957, "epoch": 2 }, { "type": "loss", "content": 0.24126788973808289, "timestamp": "2025-09-02 14:16:11.058420", "step": 1958, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:11.111788", "step": 1958, "epoch": 2 }, { "type": "loss", "content": 0.14246144890785217, "timestamp": "2025-09-02 14:16:11.114245", "step": 1959, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:11.167280", "step": 1959, "epoch": 2 }, { "type": "loss", "content": 0.1632463037967682, "timestamp": "2025-09-02 14:16:11.172846", "step": 1960, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:16:13.039850", "step": 1960, "epoch": 2 }, { "type": "pplx", "content": 7960.233422241575, "timestamp": "2025-09-02 14:16:13.042153", "step": 1960, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1960", "timestamp": "2025-09-02 14:16:13.410827", "step": 1960, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:13.467392", "step": 1960, "epoch": 2 }, { "type": "loss", "content": 0.2281976044178009, "timestamp": "2025-09-02 14:16:13.469726", "step": 1961, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:13.523777", "step": 1961, "epoch": 2 }, { "type": "loss", "content": 0.23773130774497986, "timestamp": "2025-09-02 14:16:13.525720", "step": 1962, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:13.587970", "step": 1962, "epoch": 2 }, { "type": "loss", "content": 0.17593617737293243, "timestamp": "2025-09-02 14:16:13.589765", "step": 1963, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:13.642219", "step": 1963, "epoch": 2 }, { "type": "loss", "content": 0.2800937592983246, "timestamp": "2025-09-02 14:16:13.648606", "step": 1964, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:13.701399", "step": 1964, "epoch": 2 }, { "type": "loss", "content": 0.15633702278137207, "timestamp": "2025-09-02 14:16:13.704564", "step": 1965, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:13.758336", "step": 1965, "epoch": 2 }, { "type": "loss", "content": 0.17571289837360382, "timestamp": "2025-09-02 14:16:13.760929", "step": 1966, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:13.815712", "step": 1966, "epoch": 2 }, { "type": "loss", "content": 0.15679527819156647, "timestamp": "2025-09-02 14:16:13.818065", "step": 1967, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:13.871111", "step": 1967, "epoch": 2 }, { "type": "loss", "content": 0.1573464572429657, "timestamp": "2025-09-02 14:16:13.877389", "step": 1968, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:13.930127", "step": 1968, "epoch": 2 }, { "type": "loss", "content": 0.22073139250278473, "timestamp": "2025-09-02 14:16:13.932732", "step": 1969, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:13.985912", "step": 1969, "epoch": 2 }, { "type": "loss", "content": 0.16790585219860077, "timestamp": "2025-09-02 14:16:13.988072", "step": 1970, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:14.042119", "step": 1970, "epoch": 2 }, { "type": "loss", "content": 0.26933085918426514, "timestamp": "2025-09-02 14:16:14.044037", "step": 1971, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:14.097227", "step": 1971, "epoch": 2 }, { "type": "loss", "content": 0.21755839884281158, "timestamp": "2025-09-02 14:16:14.103360", "step": 1972, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:14.156651", "step": 1972, "epoch": 2 }, { "type": "loss", "content": 0.19257321953773499, "timestamp": "2025-09-02 14:16:14.159023", "step": 1973, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:14.213100", "step": 1973, "epoch": 2 }, { "type": "loss", "content": 0.23624449968338013, "timestamp": "2025-09-02 14:16:14.215535", "step": 1974, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:14.268727", "step": 1974, "epoch": 2 }, { "type": "loss", "content": 0.18369229137897491, "timestamp": "2025-09-02 14:16:14.271239", "step": 1975, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:14.324947", "step": 1975, "epoch": 2 }, { "type": "loss", "content": 0.11342668533325195, "timestamp": "2025-09-02 14:16:14.331236", "step": 1976, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:14.383923", "step": 1976, "epoch": 2 }, { "type": "loss", "content": 0.27553847432136536, "timestamp": "2025-09-02 14:16:14.386227", "step": 1977, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:14.439150", "step": 1977, "epoch": 2 }, { "type": "loss", "content": 0.19941143691539764, "timestamp": "2025-09-02 14:16:14.441401", "step": 1978, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:14.494942", "step": 1978, "epoch": 2 }, { "type": "loss", "content": 0.10803364962339401, "timestamp": "2025-09-02 14:16:14.497302", "step": 1979, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:14.550256", "step": 1979, "epoch": 2 }, { "type": "loss", "content": 0.1528846174478531, "timestamp": "2025-09-02 14:16:14.556473", "step": 1980, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:16:16.454127", "step": 1980, "epoch": 2 }, { "type": "pplx", "content": 7993.463273603584, "timestamp": "2025-09-02 14:16:16.456726", "step": 1980, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:16.510953", "step": 1980, "epoch": 2 }, { "type": "loss", "content": 0.1467866748571396, "timestamp": "2025-09-02 14:16:16.513609", "step": 1981, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:16.567987", "step": 1981, "epoch": 2 }, { "type": "loss", "content": 0.1743169128894806, "timestamp": "2025-09-02 14:16:16.570474", "step": 1982, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:16.624054", "step": 1982, "epoch": 2 }, { "type": "loss", "content": 0.41603419184684753, "timestamp": "2025-09-02 14:16:16.626523", "step": 1983, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:16.680238", "step": 1983, "epoch": 2 }, { "type": "loss", "content": 0.0516364648938179, "timestamp": "2025-09-02 14:16:16.686287", "step": 1984, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:16.739918", "step": 1984, "epoch": 2 }, { "type": "loss", "content": 0.10951203107833862, "timestamp": "2025-09-02 14:16:16.742055", "step": 1985, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:16.795282", "step": 1985, "epoch": 2 }, { "type": "loss", "content": 0.19981423020362854, "timestamp": "2025-09-02 14:16:16.797624", "step": 1986, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:16.851149", "step": 1986, "epoch": 2 }, { "type": "loss", "content": 0.1973690241575241, "timestamp": "2025-09-02 14:16:16.853882", "step": 1987, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:16.907425", "step": 1987, "epoch": 2 }, { "type": "loss", "content": 0.2778198719024658, "timestamp": "2025-09-02 14:16:16.913693", "step": 1988, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:16.966480", "step": 1988, "epoch": 2 }, { "type": "loss", "content": 0.166127011179924, "timestamp": "2025-09-02 14:16:16.968858", "step": 1989, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:17.021349", "step": 1989, "epoch": 2 }, { "type": "loss", "content": 0.1644371598958969, "timestamp": "2025-09-02 14:16:17.023721", "step": 1990, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:17.076913", "step": 1990, "epoch": 2 }, { "type": "loss", "content": 0.18692927062511444, "timestamp": "2025-09-02 14:16:17.079370", "step": 1991, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:17.132903", "step": 1991, "epoch": 2 }, { "type": "loss", "content": 0.105436772108078, "timestamp": "2025-09-02 14:16:17.139128", "step": 1992, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:17.191672", "step": 1992, "epoch": 2 }, { "type": "loss", "content": 0.2731904983520508, "timestamp": "2025-09-02 14:16:17.194070", "step": 1993, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:17.247065", "step": 1993, "epoch": 2 }, { "type": "loss", "content": 0.2201014906167984, "timestamp": "2025-09-02 14:16:17.249406", "step": 1994, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:17.302411", "step": 1994, "epoch": 2 }, { "type": "loss", "content": 0.352260559797287, "timestamp": "2025-09-02 14:16:17.305259", "step": 1995, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:17.358314", "step": 1995, "epoch": 2 }, { "type": "loss", "content": 0.25086531043052673, "timestamp": "2025-09-02 14:16:17.364172", "step": 1996, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:17.418446", "step": 1996, "epoch": 2 }, { "type": "loss", "content": 0.19555626809597015, "timestamp": "2025-09-02 14:16:17.421510", "step": 1997, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:17.475248", "step": 1997, "epoch": 2 }, { "type": "loss", "content": 0.2143610268831253, "timestamp": "2025-09-02 14:16:17.477891", "step": 1998, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:17.530921", "step": 1998, "epoch": 2 }, { "type": "loss", "content": 0.18067225813865662, "timestamp": "2025-09-02 14:16:17.533376", "step": 1999, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:17.586425", "step": 1999, "epoch": 2 }, { "type": "loss", "content": 0.19839850068092346, "timestamp": "2025-09-02 14:16:17.592368", "step": 2000, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:16:19.481570", "step": 2000, "epoch": 2 }, { "type": "pplx", "content": 7987.858869879088, "timestamp": "2025-09-02 14:16:19.483605", "step": 2000, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2000", "timestamp": "2025-09-02 14:16:19.856544", "step": 2000, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:19.914313", "step": 2000, "epoch": 2 }, { "type": "loss", "content": 0.17432831227779388, "timestamp": "2025-09-02 14:16:19.916610", "step": 2001, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:19.970893", "step": 2001, "epoch": 2 }, { "type": "loss", "content": 0.07408805936574936, "timestamp": "2025-09-02 14:16:19.973728", "step": 2002, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:20.028739", "step": 2002, "epoch": 2 }, { "type": "loss", "content": 0.2091265320777893, "timestamp": "2025-09-02 14:16:20.031182", "step": 2003, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:20.084420", "step": 2003, "epoch": 2 }, { "type": "loss", "content": 0.14582698047161102, "timestamp": "2025-09-02 14:16:20.090490", "step": 2004, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:20.143386", "step": 2004, "epoch": 2 }, { "type": "loss", "content": 0.1183430403470993, "timestamp": "2025-09-02 14:16:20.145942", "step": 2005, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:20.199523", "step": 2005, "epoch": 2 }, { "type": "loss", "content": 0.10556184500455856, "timestamp": "2025-09-02 14:16:20.201938", "step": 2006, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:20.255936", "step": 2006, "epoch": 2 }, { "type": "loss", "content": 0.27730926871299744, "timestamp": "2025-09-02 14:16:20.258161", "step": 2007, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:20.311445", "step": 2007, "epoch": 2 }, { "type": "loss", "content": 0.21210278570652008, "timestamp": "2025-09-02 14:16:20.317581", "step": 2008, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:20.370481", "step": 2008, "epoch": 2 }, { "type": "loss", "content": 0.12592700123786926, "timestamp": "2025-09-02 14:16:20.372864", "step": 2009, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:20.426481", "step": 2009, "epoch": 2 }, { "type": "loss", "content": 0.22752872109413147, "timestamp": "2025-09-02 14:16:20.428741", "step": 2010, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:20.482406", "step": 2010, "epoch": 2 }, { "type": "loss", "content": 0.11089470237493515, "timestamp": "2025-09-02 14:16:20.484811", "step": 2011, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:20.538367", "step": 2011, "epoch": 2 }, { "type": "loss", "content": 0.12564495205879211, "timestamp": "2025-09-02 14:16:20.544245", "step": 2012, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:20.598112", "step": 2012, "epoch": 2 }, { "type": "loss", "content": 0.14139717817306519, "timestamp": "2025-09-02 14:16:20.600951", "step": 2013, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:20.654701", "step": 2013, "epoch": 2 }, { "type": "loss", "content": 0.11101257055997849, "timestamp": "2025-09-02 14:16:20.657326", "step": 2014, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:20.711131", "step": 2014, "epoch": 2 }, { "type": "loss", "content": 0.2820224463939667, "timestamp": "2025-09-02 14:16:20.714112", "step": 2015, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:20.767244", "step": 2015, "epoch": 2 }, { "type": "loss", "content": 0.18697291612625122, "timestamp": "2025-09-02 14:16:20.773434", "step": 2016, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:20.826324", "step": 2016, "epoch": 2 }, { "type": "loss", "content": 0.15529079735279083, "timestamp": "2025-09-02 14:16:20.828872", "step": 2017, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:20.881956", "step": 2017, "epoch": 2 }, { "type": "loss", "content": 0.12250766158103943, "timestamp": "2025-09-02 14:16:20.884477", "step": 2018, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:20.938314", "step": 2018, "epoch": 2 }, { "type": "loss", "content": 0.23016293346881866, "timestamp": "2025-09-02 14:16:20.941053", "step": 2019, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:20.994607", "step": 2019, "epoch": 2 }, { "type": "loss", "content": 0.11147753894329071, "timestamp": "2025-09-02 14:16:21.000940", "step": 2020, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:16:22.915366", "step": 2020, "epoch": 2 }, { "type": "pplx", "content": 8212.539708974062, "timestamp": "2025-09-02 14:16:22.918089", "step": 2020, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:22.971912", "step": 2020, "epoch": 2 }, { "type": "loss", "content": 0.18212635815143585, "timestamp": "2025-09-02 14:16:22.974888", "step": 2021, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:23.028930", "step": 2021, "epoch": 2 }, { "type": "loss", "content": 0.1127689778804779, "timestamp": "2025-09-02 14:16:23.031204", "step": 2022, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:23.084610", "step": 2022, "epoch": 2 }, { "type": "loss", "content": 0.19303834438323975, "timestamp": "2025-09-02 14:16:23.087141", "step": 2023, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:23.140865", "step": 2023, "epoch": 2 }, { "type": "loss", "content": 0.15409202873706818, "timestamp": "2025-09-02 14:16:23.147873", "step": 2024, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:23.200341", "step": 2024, "epoch": 2 }, { "type": "loss", "content": 0.05481535568833351, "timestamp": "2025-09-02 14:16:23.202708", "step": 2025, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:23.255627", "step": 2025, "epoch": 2 }, { "type": "loss", "content": 0.22515831887722015, "timestamp": "2025-09-02 14:16:23.257938", "step": 2026, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:23.311090", "step": 2026, "epoch": 2 }, { "type": "loss", "content": 0.125798761844635, "timestamp": "2025-09-02 14:16:23.313460", "step": 2027, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:23.366896", "step": 2027, "epoch": 2 }, { "type": "loss", "content": 0.10671111196279526, "timestamp": "2025-09-02 14:16:23.373075", "step": 2028, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:23.425423", "step": 2028, "epoch": 2 }, { "type": "loss", "content": 0.12056569010019302, "timestamp": "2025-09-02 14:16:23.428415", "step": 2029, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:23.481560", "step": 2029, "epoch": 2 }, { "type": "loss", "content": 0.158100426197052, "timestamp": "2025-09-02 14:16:23.483685", "step": 2030, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:23.537941", "step": 2030, "epoch": 2 }, { "type": "loss", "content": 0.13019317388534546, "timestamp": "2025-09-02 14:16:23.539981", "step": 2031, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:23.593091", "step": 2031, "epoch": 2 }, { "type": "loss", "content": 0.22895416617393494, "timestamp": "2025-09-02 14:16:23.599111", "step": 2032, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:23.652277", "step": 2032, "epoch": 2 }, { "type": "loss", "content": 0.23401188850402832, "timestamp": "2025-09-02 14:16:23.654476", "step": 2033, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:23.707689", "step": 2033, "epoch": 2 }, { "type": "loss", "content": 0.12030984461307526, "timestamp": "2025-09-02 14:16:23.709849", "step": 2034, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:23.762995", "step": 2034, "epoch": 2 }, { "type": "loss", "content": 0.1536511927843094, "timestamp": "2025-09-02 14:16:23.765324", "step": 2035, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:23.819031", "step": 2035, "epoch": 2 }, { "type": "loss", "content": 0.1710452288389206, "timestamp": "2025-09-02 14:16:23.825044", "step": 2036, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:23.877548", "step": 2036, "epoch": 2 }, { "type": "loss", "content": 0.18776728212833405, "timestamp": "2025-09-02 14:16:23.879595", "step": 2037, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:23.932768", "step": 2037, "epoch": 2 }, { "type": "loss", "content": 0.1916312724351883, "timestamp": "2025-09-02 14:16:23.934977", "step": 2038, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:23.988205", "step": 2038, "epoch": 2 }, { "type": "loss", "content": 0.23317372798919678, "timestamp": "2025-09-02 14:16:23.990393", "step": 2039, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:24.044391", "step": 2039, "epoch": 2 }, { "type": "loss", "content": 0.16263718903064728, "timestamp": "2025-09-02 14:16:24.050570", "step": 2040, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:16:25.932560", "step": 2040, "epoch": 2 }, { "type": "pplx", "content": 8421.456705079483, "timestamp": "2025-09-02 14:16:25.935119", "step": 2040, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2040", "timestamp": "2025-09-02 14:16:26.336337", "step": 2040, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:26.391950", "step": 2040, "epoch": 2 }, { "type": "loss", "content": 0.10665247589349747, "timestamp": "2025-09-02 14:16:26.394440", "step": 2041, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:26.450657", "step": 2041, "epoch": 2 }, { "type": "loss", "content": 0.10708825290203094, "timestamp": "2025-09-02 14:16:26.452986", "step": 2042, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:26.507440", "step": 2042, "epoch": 2 }, { "type": "loss", "content": 0.20123836398124695, "timestamp": "2025-09-02 14:16:26.509570", "step": 2043, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:26.563265", "step": 2043, "epoch": 2 }, { "type": "loss", "content": 0.14129450917243958, "timestamp": "2025-09-02 14:16:26.569851", "step": 2044, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:26.626119", "step": 2044, "epoch": 2 }, { "type": "loss", "content": 0.17072954773902893, "timestamp": "2025-09-02 14:16:26.628252", "step": 2045, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:26.681642", "step": 2045, "epoch": 2 }, { "type": "loss", "content": 0.15123789012432098, "timestamp": "2025-09-02 14:16:26.684249", "step": 2046, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:26.740828", "step": 2046, "epoch": 2 }, { "type": "loss", "content": 0.1730678528547287, "timestamp": "2025-09-02 14:16:26.743518", "step": 2047, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:26.797331", "step": 2047, "epoch": 2 }, { "type": "loss", "content": 0.29943737387657166, "timestamp": "2025-09-02 14:16:26.803740", "step": 2048, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:26.856915", "step": 2048, "epoch": 2 }, { "type": "loss", "content": 0.11346372216939926, "timestamp": "2025-09-02 14:16:26.859459", "step": 2049, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:26.913292", "step": 2049, "epoch": 2 }, { "type": "loss", "content": 0.10751362144947052, "timestamp": "2025-09-02 14:16:26.915623", "step": 2050, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:26.969400", "step": 2050, "epoch": 2 }, { "type": "loss", "content": 0.1823345124721527, "timestamp": "2025-09-02 14:16:26.971557", "step": 2051, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:27.024762", "step": 2051, "epoch": 2 }, { "type": "loss", "content": 0.12020079791545868, "timestamp": "2025-09-02 14:16:27.030997", "step": 2052, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:27.084372", "step": 2052, "epoch": 2 }, { "type": "loss", "content": 0.2887284755706787, "timestamp": "2025-09-02 14:16:27.086388", "step": 2053, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:27.139267", "step": 2053, "epoch": 2 }, { "type": "loss", "content": 0.13805408775806427, "timestamp": "2025-09-02 14:16:27.141898", "step": 2054, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:27.196417", "step": 2054, "epoch": 2 }, { "type": "loss", "content": 0.14332543313503265, "timestamp": "2025-09-02 14:16:27.198722", "step": 2055, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:27.252634", "step": 2055, "epoch": 2 }, { "type": "loss", "content": 0.14242342114448547, "timestamp": "2025-09-02 14:16:27.258767", "step": 2056, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:27.311396", "step": 2056, "epoch": 2 }, { "type": "loss", "content": 0.22604402899742126, "timestamp": "2025-09-02 14:16:27.313750", "step": 2057, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:27.366803", "step": 2057, "epoch": 2 }, { "type": "loss", "content": 0.2967451810836792, "timestamp": "2025-09-02 14:16:27.369041", "step": 2058, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:27.422626", "step": 2058, "epoch": 2 }, { "type": "loss", "content": 0.1109696552157402, "timestamp": "2025-09-02 14:16:27.424995", "step": 2059, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:27.478768", "step": 2059, "epoch": 2 }, { "type": "loss", "content": 0.12601907551288605, "timestamp": "2025-09-02 14:16:27.484863", "step": 2060, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:16:29.382272", "step": 2060, "epoch": 2 }, { "type": "pplx", "content": 8250.51666542273, "timestamp": "2025-09-02 14:16:29.384422", "step": 2060, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:29.437433", "step": 2060, "epoch": 2 }, { "type": "loss", "content": 0.1437065303325653, "timestamp": "2025-09-02 14:16:29.439964", "step": 2061, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:29.493804", "step": 2061, "epoch": 2 }, { "type": "loss", "content": 0.22151115536689758, "timestamp": "2025-09-02 14:16:29.496146", "step": 2062, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:29.550095", "step": 2062, "epoch": 2 }, { "type": "loss", "content": 0.2869545817375183, "timestamp": "2025-09-02 14:16:29.552666", "step": 2063, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:29.606186", "step": 2063, "epoch": 2 }, { "type": "loss", "content": 0.12266626209020615, "timestamp": "2025-09-02 14:16:29.612497", "step": 2064, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:29.665400", "step": 2064, "epoch": 2 }, { "type": "loss", "content": 0.14543543756008148, "timestamp": "2025-09-02 14:16:29.667745", "step": 2065, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:29.721484", "step": 2065, "epoch": 2 }, { "type": "loss", "content": 0.10597895830869675, "timestamp": "2025-09-02 14:16:29.723540", "step": 2066, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:29.779611", "step": 2066, "epoch": 2 }, { "type": "loss", "content": 0.1577940732240677, "timestamp": "2025-09-02 14:16:29.782139", "step": 2067, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:29.835928", "step": 2067, "epoch": 2 }, { "type": "loss", "content": 0.27325963973999023, "timestamp": "2025-09-02 14:16:29.842042", "step": 2068, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:29.894635", "step": 2068, "epoch": 2 }, { "type": "loss", "content": 0.22532342374324799, "timestamp": "2025-09-02 14:16:29.897020", "step": 2069, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:29.950185", "step": 2069, "epoch": 2 }, { "type": "loss", "content": 0.23822250962257385, "timestamp": "2025-09-02 14:16:29.952522", "step": 2070, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:30.005777", "step": 2070, "epoch": 2 }, { "type": "loss", "content": 0.1157916858792305, "timestamp": "2025-09-02 14:16:30.008063", "step": 2071, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:30.061159", "step": 2071, "epoch": 2 }, { "type": "loss", "content": 0.12981998920440674, "timestamp": "2025-09-02 14:16:30.067402", "step": 2072, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:30.120361", "step": 2072, "epoch": 2 }, { "type": "loss", "content": 0.15163695812225342, "timestamp": "2025-09-02 14:16:30.122732", "step": 2073, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:30.176735", "step": 2073, "epoch": 2 }, { "type": "loss", "content": 0.18701499700546265, "timestamp": "2025-09-02 14:16:30.178965", "step": 2074, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:30.231859", "step": 2074, "epoch": 2 }, { "type": "loss", "content": 0.22861644625663757, "timestamp": "2025-09-02 14:16:30.234340", "step": 2075, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:30.287671", "step": 2075, "epoch": 2 }, { "type": "loss", "content": 0.1636151820421219, "timestamp": "2025-09-02 14:16:30.294211", "step": 2076, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:30.346909", "step": 2076, "epoch": 2 }, { "type": "loss", "content": 0.1674899011850357, "timestamp": "2025-09-02 14:16:30.349282", "step": 2077, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:30.402806", "step": 2077, "epoch": 2 }, { "type": "loss", "content": 0.23478244245052338, "timestamp": "2025-09-02 14:16:30.405014", "step": 2078, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:30.458115", "step": 2078, "epoch": 2 }, { "type": "loss", "content": 0.19238793849945068, "timestamp": "2025-09-02 14:16:30.460229", "step": 2079, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:30.513333", "step": 2079, "epoch": 2 }, { "type": "loss", "content": 0.22729843854904175, "timestamp": "2025-09-02 14:16:30.519212", "step": 2080, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:16:32.436380", "step": 2080, "epoch": 2 }, { "type": "pplx", "content": 8258.533625920138, "timestamp": "2025-09-02 14:16:32.439061", "step": 2080, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2080", "timestamp": "2025-09-02 14:16:32.802432", "step": 2080, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:32.856867", "step": 2080, "epoch": 2 }, { "type": "loss", "content": 0.19939005374908447, "timestamp": "2025-09-02 14:16:32.859303", "step": 2081, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:32.913739", "step": 2081, "epoch": 2 }, { "type": "loss", "content": 0.20496052503585815, "timestamp": "2025-09-02 14:16:32.915894", "step": 2082, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:32.972494", "step": 2082, "epoch": 2 }, { "type": "loss", "content": 0.25080406665802, "timestamp": "2025-09-02 14:16:32.975729", "step": 2083, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:33.030385", "step": 2083, "epoch": 2 }, { "type": "loss", "content": 0.09147825092077255, "timestamp": "2025-09-02 14:16:33.037254", "step": 2084, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:33.092464", "step": 2084, "epoch": 2 }, { "type": "loss", "content": 0.07453519850969315, "timestamp": "2025-09-02 14:16:33.094528", "step": 2085, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:33.149026", "step": 2085, "epoch": 2 }, { "type": "loss", "content": 0.11507782340049744, "timestamp": "2025-09-02 14:16:33.151931", "step": 2086, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:33.205816", "step": 2086, "epoch": 2 }, { "type": "loss", "content": 0.15544471144676208, "timestamp": "2025-09-02 14:16:33.208153", "step": 2087, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:33.261336", "step": 2087, "epoch": 2 }, { "type": "loss", "content": 0.12246864289045334, "timestamp": "2025-09-02 14:16:33.267787", "step": 2088, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:33.323818", "step": 2088, "epoch": 2 }, { "type": "loss", "content": 0.12214968353509903, "timestamp": "2025-09-02 14:16:33.326394", "step": 2089, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:33.379601", "step": 2089, "epoch": 2 }, { "type": "loss", "content": 0.21492807567119598, "timestamp": "2025-09-02 14:16:33.382030", "step": 2090, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:33.436740", "step": 2090, "epoch": 2 }, { "type": "loss", "content": 0.11759039014577866, "timestamp": "2025-09-02 14:16:33.439014", "step": 2091, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:33.494712", "step": 2091, "epoch": 2 }, { "type": "loss", "content": 0.14825695753097534, "timestamp": "2025-09-02 14:16:33.500943", "step": 2092, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:33.554919", "step": 2092, "epoch": 2 }, { "type": "loss", "content": 0.0817965716123581, "timestamp": "2025-09-02 14:16:33.558007", "step": 2093, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:33.613665", "step": 2093, "epoch": 2 }, { "type": "loss", "content": 0.2972908914089203, "timestamp": "2025-09-02 14:16:33.617345", "step": 2094, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:33.671758", "step": 2094, "epoch": 2 }, { "type": "loss", "content": 0.22624477744102478, "timestamp": "2025-09-02 14:16:33.674232", "step": 2095, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:33.730029", "step": 2095, "epoch": 2 }, { "type": "loss", "content": 0.24540981650352478, "timestamp": "2025-09-02 14:16:33.736510", "step": 2096, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:33.789299", "step": 2096, "epoch": 2 }, { "type": "loss", "content": 0.16782015562057495, "timestamp": "2025-09-02 14:16:33.791701", "step": 2097, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:33.846528", "step": 2097, "epoch": 2 }, { "type": "loss", "content": 0.13809500634670258, "timestamp": "2025-09-02 14:16:33.848949", "step": 2098, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:33.902667", "step": 2098, "epoch": 2 }, { "type": "loss", "content": 0.10651980340480804, "timestamp": "2025-09-02 14:16:33.905134", "step": 2099, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:33.959370", "step": 2099, "epoch": 2 }, { "type": "loss", "content": 0.21921372413635254, "timestamp": "2025-09-02 14:16:33.965644", "step": 2100, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:16:35.883213", "step": 2100, "epoch": 2 }, { "type": "pplx", "content": 8300.696578745585, "timestamp": "2025-09-02 14:16:35.885960", "step": 2100, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:35.939081", "step": 2100, "epoch": 2 }, { "type": "loss", "content": 0.11928582936525345, "timestamp": "2025-09-02 14:16:35.941852", "step": 2101, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:35.996414", "step": 2101, "epoch": 2 }, { "type": "loss", "content": 0.14046332240104675, "timestamp": "2025-09-02 14:16:35.998918", "step": 2102, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:36.052913", "step": 2102, "epoch": 2 }, { "type": "loss", "content": 0.2966403067111969, "timestamp": "2025-09-02 14:16:36.055262", "step": 2103, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:36.109623", "step": 2103, "epoch": 2 }, { "type": "loss", "content": 0.111586794257164, "timestamp": "2025-09-02 14:16:36.116019", "step": 2104, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:36.169254", "step": 2104, "epoch": 2 }, { "type": "loss", "content": 0.22482076287269592, "timestamp": "2025-09-02 14:16:36.172020", "step": 2105, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:36.225521", "step": 2105, "epoch": 2 }, { "type": "loss", "content": 0.1619042158126831, "timestamp": "2025-09-02 14:16:36.228513", "step": 2106, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:36.281958", "step": 2106, "epoch": 2 }, { "type": "loss", "content": 0.24068506062030792, "timestamp": "2025-09-02 14:16:36.284424", "step": 2107, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:36.337855", "step": 2107, "epoch": 2 }, { "type": "loss", "content": 0.1550178825855255, "timestamp": "2025-09-02 14:16:36.344246", "step": 2108, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:36.397727", "step": 2108, "epoch": 2 }, { "type": "loss", "content": 0.16483092308044434, "timestamp": "2025-09-02 14:16:36.400318", "step": 2109, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:36.453761", "step": 2109, "epoch": 2 }, { "type": "loss", "content": 0.24241426587104797, "timestamp": "2025-09-02 14:16:36.456285", "step": 2110, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:36.510144", "step": 2110, "epoch": 2 }, { "type": "loss", "content": 0.0761779174208641, "timestamp": "2025-09-02 14:16:36.513420", "step": 2111, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:36.567028", "step": 2111, "epoch": 2 }, { "type": "loss", "content": 0.2285076528787613, "timestamp": "2025-09-02 14:16:36.573139", "step": 2112, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:36.626651", "step": 2112, "epoch": 2 }, { "type": "loss", "content": 0.22308595478534698, "timestamp": "2025-09-02 14:16:36.629061", "step": 2113, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:36.682056", "step": 2113, "epoch": 2 }, { "type": "loss", "content": 0.10022953152656555, "timestamp": "2025-09-02 14:16:36.684383", "step": 2114, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:36.738488", "step": 2114, "epoch": 2 }, { "type": "loss", "content": 0.19231818616390228, "timestamp": "2025-09-02 14:16:36.741079", "step": 2115, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:36.796151", "step": 2115, "epoch": 2 }, { "type": "loss", "content": 0.2764391303062439, "timestamp": "2025-09-02 14:16:36.802167", "step": 2116, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:36.858019", "step": 2116, "epoch": 2 }, { "type": "loss", "content": 0.24718226492404938, "timestamp": "2025-09-02 14:16:36.860312", "step": 2117, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:36.914825", "step": 2117, "epoch": 2 }, { "type": "loss", "content": 0.1310458779335022, "timestamp": "2025-09-02 14:16:36.917333", "step": 2118, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:36.972014", "step": 2118, "epoch": 2 }, { "type": "loss", "content": 0.21107296645641327, "timestamp": "2025-09-02 14:16:36.974447", "step": 2119, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:37.027769", "step": 2119, "epoch": 2 }, { "type": "loss", "content": 0.24468503892421722, "timestamp": "2025-09-02 14:16:37.033781", "step": 2120, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:16:38.949462", "step": 2120, "epoch": 2 }, { "type": "pplx", "content": 8365.434450807103, "timestamp": "2025-09-02 14:16:38.951723", "step": 2120, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2120", "timestamp": "2025-09-02 14:16:39.311332", "step": 2120, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:39.366083", "step": 2120, "epoch": 2 }, { "type": "loss", "content": 0.1883801817893982, "timestamp": "2025-09-02 14:16:39.368503", "step": 2121, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:39.422463", "step": 2121, "epoch": 2 }, { "type": "loss", "content": 0.20216074585914612, "timestamp": "2025-09-02 14:16:39.425081", "step": 2122, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:39.478154", "step": 2122, "epoch": 2 }, { "type": "loss", "content": 0.26431113481521606, "timestamp": "2025-09-02 14:16:39.480383", "step": 2123, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:39.533331", "step": 2123, "epoch": 2 }, { "type": "loss", "content": 0.11927561461925507, "timestamp": "2025-09-02 14:16:39.539822", "step": 2124, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:39.592072", "step": 2124, "epoch": 2 }, { "type": "loss", "content": 0.06770074367523193, "timestamp": "2025-09-02 14:16:39.594523", "step": 2125, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:39.647934", "step": 2125, "epoch": 2 }, { "type": "loss", "content": 0.1345909982919693, "timestamp": "2025-09-02 14:16:39.651561", "step": 2126, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:39.704313", "step": 2126, "epoch": 2 }, { "type": "loss", "content": 0.14271077513694763, "timestamp": "2025-09-02 14:16:39.706585", "step": 2127, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:39.767205", "step": 2127, "epoch": 2 }, { "type": "loss", "content": 0.08749767392873764, "timestamp": "2025-09-02 14:16:39.773118", "step": 2128, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:39.825196", "step": 2128, "epoch": 2 }, { "type": "loss", "content": 0.4145244061946869, "timestamp": "2025-09-02 14:16:39.827318", "step": 2129, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:39.880174", "step": 2129, "epoch": 2 }, { "type": "loss", "content": 0.14630192518234253, "timestamp": "2025-09-02 14:16:39.882396", "step": 2130, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:39.935724", "step": 2130, "epoch": 2 }, { "type": "loss", "content": 0.18521857261657715, "timestamp": "2025-09-02 14:16:39.937994", "step": 2131, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:39.990721", "step": 2131, "epoch": 2 }, { "type": "loss", "content": 0.10139771550893784, "timestamp": "2025-09-02 14:16:39.996887", "step": 2132, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:40.049813", "step": 2132, "epoch": 2 }, { "type": "loss", "content": 0.1867779791355133, "timestamp": "2025-09-02 14:16:40.052089", "step": 2133, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:40.105358", "step": 2133, "epoch": 2 }, { "type": "loss", "content": 0.08130506426095963, "timestamp": "2025-09-02 14:16:40.107820", "step": 2134, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:40.160955", "step": 2134, "epoch": 2 }, { "type": "loss", "content": 0.2772088944911957, "timestamp": "2025-09-02 14:16:40.163317", "step": 2135, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:40.216633", "step": 2135, "epoch": 2 }, { "type": "loss", "content": 0.17145875096321106, "timestamp": "2025-09-02 14:16:40.222558", "step": 2136, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:40.275764", "step": 2136, "epoch": 2 }, { "type": "loss", "content": 0.19795292615890503, "timestamp": "2025-09-02 14:16:40.278091", "step": 2137, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:40.331229", "step": 2137, "epoch": 2 }, { "type": "loss", "content": 0.16409890353679657, "timestamp": "2025-09-02 14:16:40.333821", "step": 2138, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:40.386768", "step": 2138, "epoch": 2 }, { "type": "loss", "content": 0.08971104025840759, "timestamp": "2025-09-02 14:16:40.389091", "step": 2139, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:40.442439", "step": 2139, "epoch": 2 }, { "type": "loss", "content": 0.19643428921699524, "timestamp": "2025-09-02 14:16:40.448626", "step": 2140, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:16:42.327664", "step": 2140, "epoch": 2 }, { "type": "pplx", "content": 8486.585296232806, "timestamp": "2025-09-02 14:16:42.329896", "step": 2140, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:42.383403", "step": 2140, "epoch": 2 }, { "type": "loss", "content": 0.2463015466928482, "timestamp": "2025-09-02 14:16:42.387178", "step": 2141, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:42.442226", "step": 2141, "epoch": 2 }, { "type": "loss", "content": 0.27791503071784973, "timestamp": "2025-09-02 14:16:42.444850", "step": 2142, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:42.497584", "step": 2142, "epoch": 2 }, { "type": "loss", "content": 0.06083592399954796, "timestamp": "2025-09-02 14:16:42.499998", "step": 2143, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:42.553166", "step": 2143, "epoch": 2 }, { "type": "loss", "content": 0.21286597847938538, "timestamp": "2025-09-02 14:16:42.559251", "step": 2144, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:42.611624", "step": 2144, "epoch": 2 }, { "type": "loss", "content": 0.1900375783443451, "timestamp": "2025-09-02 14:16:42.613884", "step": 2145, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:42.666411", "step": 2145, "epoch": 2 }, { "type": "loss", "content": 0.1849222183227539, "timestamp": "2025-09-02 14:16:42.668683", "step": 2146, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:42.721866", "step": 2146, "epoch": 2 }, { "type": "loss", "content": 0.07294631004333496, "timestamp": "2025-09-02 14:16:42.724219", "step": 2147, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:42.777109", "step": 2147, "epoch": 2 }, { "type": "loss", "content": 0.16402828693389893, "timestamp": "2025-09-02 14:16:42.783222", "step": 2148, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:42.836093", "step": 2148, "epoch": 2 }, { "type": "loss", "content": 0.23572638630867004, "timestamp": "2025-09-02 14:16:42.838417", "step": 2149, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:42.892525", "step": 2149, "epoch": 2 }, { "type": "loss", "content": 0.13323859870433807, "timestamp": "2025-09-02 14:16:42.894966", "step": 2150, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:42.948098", "step": 2150, "epoch": 2 }, { "type": "loss", "content": 0.1521712690591812, "timestamp": "2025-09-02 14:16:42.950747", "step": 2151, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:43.004342", "step": 2151, "epoch": 2 }, { "type": "loss", "content": 0.18817837536334991, "timestamp": "2025-09-02 14:16:43.010140", "step": 2152, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:43.062605", "step": 2152, "epoch": 2 }, { "type": "loss", "content": 0.19086650013923645, "timestamp": "2025-09-02 14:16:43.065612", "step": 2153, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:43.120345", "step": 2153, "epoch": 2 }, { "type": "loss", "content": 0.19703464210033417, "timestamp": "2025-09-02 14:16:43.122629", "step": 2154, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:43.176268", "step": 2154, "epoch": 2 }, { "type": "loss", "content": 0.1736181080341339, "timestamp": "2025-09-02 14:16:43.178596", "step": 2155, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:43.231799", "step": 2155, "epoch": 2 }, { "type": "loss", "content": 0.29498159885406494, "timestamp": "2025-09-02 14:16:43.237844", "step": 2156, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:43.290055", "step": 2156, "epoch": 2 }, { "type": "loss", "content": 0.1851561963558197, "timestamp": "2025-09-02 14:16:43.292423", "step": 2157, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:43.345175", "step": 2157, "epoch": 2 }, { "type": "loss", "content": 0.12211586534976959, "timestamp": "2025-09-02 14:16:43.348035", "step": 2158, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:43.401168", "step": 2158, "epoch": 2 }, { "type": "loss", "content": 0.11737710982561111, "timestamp": "2025-09-02 14:16:43.403703", "step": 2159, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:43.456735", "step": 2159, "epoch": 2 }, { "type": "loss", "content": 0.16767185926437378, "timestamp": "2025-09-02 14:16:43.462600", "step": 2160, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:16:45.342172", "step": 2160, "epoch": 2 }, { "type": "pplx", "content": 8561.920150671873, "timestamp": "2025-09-02 14:16:45.344899", "step": 2160, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2160", "timestamp": "2025-09-02 14:16:45.831391", "step": 2160, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:45.888209", "step": 2160, "epoch": 2 }, { "type": "loss", "content": 0.21794533729553223, "timestamp": "2025-09-02 14:16:45.890814", "step": 2161, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:45.945173", "step": 2161, "epoch": 2 }, { "type": "loss", "content": 0.22590528428554535, "timestamp": "2025-09-02 14:16:45.947782", "step": 2162, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:46.000889", "step": 2162, "epoch": 2 }, { "type": "loss", "content": 0.1806095987558365, "timestamp": "2025-09-02 14:16:46.003360", "step": 2163, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:46.056406", "step": 2163, "epoch": 2 }, { "type": "loss", "content": 0.1330559253692627, "timestamp": "2025-09-02 14:16:46.062687", "step": 2164, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:46.114798", "step": 2164, "epoch": 2 }, { "type": "loss", "content": 0.15306809544563293, "timestamp": "2025-09-02 14:16:46.117017", "step": 2165, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:46.169746", "step": 2165, "epoch": 2 }, { "type": "loss", "content": 0.19233158230781555, "timestamp": "2025-09-02 14:16:46.172145", "step": 2166, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:46.224865", "step": 2166, "epoch": 2 }, { "type": "loss", "content": 0.08779321610927582, "timestamp": "2025-09-02 14:16:46.227388", "step": 2167, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:46.280321", "step": 2167, "epoch": 2 }, { "type": "loss", "content": 0.16827557981014252, "timestamp": "2025-09-02 14:16:46.286539", "step": 2168, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:46.338920", "step": 2168, "epoch": 2 }, { "type": "loss", "content": 0.058072805404663086, "timestamp": "2025-09-02 14:16:46.341399", "step": 2169, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:46.394441", "step": 2169, "epoch": 2 }, { "type": "loss", "content": 0.15392708778381348, "timestamp": "2025-09-02 14:16:46.397204", "step": 2170, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:46.451054", "step": 2170, "epoch": 2 }, { "type": "loss", "content": 0.19597198069095612, "timestamp": "2025-09-02 14:16:46.453601", "step": 2171, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:46.506375", "step": 2171, "epoch": 2 }, { "type": "loss", "content": 0.22746504843235016, "timestamp": "2025-09-02 14:16:46.512655", "step": 2172, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:46.564914", "step": 2172, "epoch": 2 }, { "type": "loss", "content": 0.17754727602005005, "timestamp": "2025-09-02 14:16:46.567398", "step": 2173, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:46.621265", "step": 2173, "epoch": 2 }, { "type": "loss", "content": 0.13814830780029297, "timestamp": "2025-09-02 14:16:46.623376", "step": 2174, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:46.676102", "step": 2174, "epoch": 2 }, { "type": "loss", "content": 0.2725941836833954, "timestamp": "2025-09-02 14:16:46.678625", "step": 2175, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:46.731369", "step": 2175, "epoch": 2 }, { "type": "loss", "content": 0.17599956691265106, "timestamp": "2025-09-02 14:16:46.737316", "step": 2176, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:46.791427", "step": 2176, "epoch": 2 }, { "type": "loss", "content": 0.253390908241272, "timestamp": "2025-09-02 14:16:46.793747", "step": 2177, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:46.846718", "step": 2177, "epoch": 2 }, { "type": "loss", "content": 0.24014216661453247, "timestamp": "2025-09-02 14:16:46.849323", "step": 2178, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:46.902734", "step": 2178, "epoch": 2 }, { "type": "loss", "content": 0.14413748681545258, "timestamp": "2025-09-02 14:16:46.904964", "step": 2179, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:46.958365", "step": 2179, "epoch": 2 }, { "type": "loss", "content": 0.13174913823604584, "timestamp": "2025-09-02 14:16:46.964365", "step": 2180, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:16:48.856650", "step": 2180, "epoch": 2 }, { "type": "pplx", "content": 8549.377492273028, "timestamp": "2025-09-02 14:16:48.859913", "step": 2180, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:48.913587", "step": 2180, "epoch": 2 }, { "type": "loss", "content": 0.1721327155828476, "timestamp": "2025-09-02 14:16:48.916504", "step": 2181, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:48.970566", "step": 2181, "epoch": 2 }, { "type": "loss", "content": 0.31343796849250793, "timestamp": "2025-09-02 14:16:48.973020", "step": 2182, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:49.026792", "step": 2182, "epoch": 2 }, { "type": "loss", "content": 0.18813374638557434, "timestamp": "2025-09-02 14:16:49.029069", "step": 2183, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:49.082463", "step": 2183, "epoch": 2 }, { "type": "loss", "content": 0.2883349657058716, "timestamp": "2025-09-02 14:16:49.088769", "step": 2184, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:49.141702", "step": 2184, "epoch": 2 }, { "type": "loss", "content": 0.31172025203704834, "timestamp": "2025-09-02 14:16:49.144228", "step": 2185, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:49.198237", "step": 2185, "epoch": 2 }, { "type": "loss", "content": 0.21886524558067322, "timestamp": "2025-09-02 14:16:49.200605", "step": 2186, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:49.254518", "step": 2186, "epoch": 2 }, { "type": "loss", "content": 0.17962656915187836, "timestamp": "2025-09-02 14:16:49.257050", "step": 2187, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:49.310342", "step": 2187, "epoch": 2 }, { "type": "loss", "content": 0.13854138553142548, "timestamp": "2025-09-02 14:16:49.316769", "step": 2188, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:49.369869", "step": 2188, "epoch": 2 }, { "type": "loss", "content": 0.1237589493393898, "timestamp": "2025-09-02 14:16:49.372731", "step": 2189, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:49.429304", "step": 2189, "epoch": 2 }, { "type": "loss", "content": 0.09385298937559128, "timestamp": "2025-09-02 14:16:49.431599", "step": 2190, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:49.485035", "step": 2190, "epoch": 2 }, { "type": "loss", "content": 0.18028032779693604, "timestamp": "2025-09-02 14:16:49.487479", "step": 2191, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:49.541297", "step": 2191, "epoch": 2 }, { "type": "loss", "content": 0.08850746601819992, "timestamp": "2025-09-02 14:16:49.547490", "step": 2192, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:49.600121", "step": 2192, "epoch": 2 }, { "type": "loss", "content": 0.14804531633853912, "timestamp": "2025-09-02 14:16:49.602570", "step": 2193, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:49.656053", "step": 2193, "epoch": 2 }, { "type": "loss", "content": 0.11116036027669907, "timestamp": "2025-09-02 14:16:49.658305", "step": 2194, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:49.711820", "step": 2194, "epoch": 2 }, { "type": "loss", "content": 0.1714933216571808, "timestamp": "2025-09-02 14:16:49.714219", "step": 2195, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:49.767611", "step": 2195, "epoch": 2 }, { "type": "loss", "content": 0.27390843629837036, "timestamp": "2025-09-02 14:16:49.773728", "step": 2196, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:49.826280", "step": 2196, "epoch": 2 }, { "type": "loss", "content": 0.1754336804151535, "timestamp": "2025-09-02 14:16:49.828583", "step": 2197, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:49.881592", "step": 2197, "epoch": 2 }, { "type": "loss", "content": 0.23191963136196136, "timestamp": "2025-09-02 14:16:49.883503", "step": 2198, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:49.936460", "step": 2198, "epoch": 2 }, { "type": "loss", "content": 0.04575252905488014, "timestamp": "2025-09-02 14:16:49.938937", "step": 2199, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:49.992412", "step": 2199, "epoch": 2 }, { "type": "loss", "content": 0.26044198870658875, "timestamp": "2025-09-02 14:16:49.998463", "step": 2200, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:16:51.882103", "step": 2200, "epoch": 2 }, { "type": "pplx", "content": 8520.99261402752, "timestamp": "2025-09-02 14:16:51.884609", "step": 2200, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2200", "timestamp": "2025-09-02 14:16:52.377003", "step": 2200, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:52.432428", "step": 2200, "epoch": 2 }, { "type": "loss", "content": 0.06907603144645691, "timestamp": "2025-09-02 14:16:52.434885", "step": 2201, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:52.488782", "step": 2201, "epoch": 2 }, { "type": "loss", "content": 0.16853530704975128, "timestamp": "2025-09-02 14:16:52.491176", "step": 2202, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:52.544824", "step": 2202, "epoch": 2 }, { "type": "loss", "content": 0.1509072333574295, "timestamp": "2025-09-02 14:16:52.548624", "step": 2203, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:52.604244", "step": 2203, "epoch": 2 }, { "type": "loss", "content": 0.14219073951244354, "timestamp": "2025-09-02 14:16:52.615681", "step": 2204, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:52.679373", "step": 2204, "epoch": 2 }, { "type": "loss", "content": 0.3001638948917389, "timestamp": "2025-09-02 14:16:52.681697", "step": 2205, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:52.736837", "step": 2205, "epoch": 2 }, { "type": "loss", "content": 0.1800740510225296, "timestamp": "2025-09-02 14:16:52.739164", "step": 2206, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:52.792515", "step": 2206, "epoch": 2 }, { "type": "loss", "content": 0.2003937065601349, "timestamp": "2025-09-02 14:16:52.794979", "step": 2207, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:52.848090", "step": 2207, "epoch": 2 }, { "type": "loss", "content": 0.18119023740291595, "timestamp": "2025-09-02 14:16:52.854124", "step": 2208, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:52.907056", "step": 2208, "epoch": 2 }, { "type": "loss", "content": 0.19043385982513428, "timestamp": "2025-09-02 14:16:52.909364", "step": 2209, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:52.962717", "step": 2209, "epoch": 2 }, { "type": "loss", "content": 0.09870438277721405, "timestamp": "2025-09-02 14:16:52.965310", "step": 2210, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:53.018642", "step": 2210, "epoch": 2 }, { "type": "loss", "content": 0.23826557397842407, "timestamp": "2025-09-02 14:16:53.022412", "step": 2211, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:53.076276", "step": 2211, "epoch": 2 }, { "type": "loss", "content": 0.13537658751010895, "timestamp": "2025-09-02 14:16:53.082320", "step": 2212, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:53.134502", "step": 2212, "epoch": 2 }, { "type": "loss", "content": 0.21277526021003723, "timestamp": "2025-09-02 14:16:53.136964", "step": 2213, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:53.190682", "step": 2213, "epoch": 2 }, { "type": "loss", "content": 0.2041429877281189, "timestamp": "2025-09-02 14:16:53.193171", "step": 2214, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:53.246361", "step": 2214, "epoch": 2 }, { "type": "loss", "content": 0.16511383652687073, "timestamp": "2025-09-02 14:16:53.248539", "step": 2215, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:53.301636", "step": 2215, "epoch": 2 }, { "type": "loss", "content": 0.10561374574899673, "timestamp": "2025-09-02 14:16:53.308379", "step": 2216, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-02 14:16:53.362497", "step": 2216, "epoch": 2 }, { "type": "loss", "content": 0.2989220917224884, "timestamp": "2025-09-02 14:16:53.364831", "step": 2217, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:53.417287", "step": 2217, "epoch": 2 }, { "type": "loss", "content": 0.12876497209072113, "timestamp": "2025-09-02 14:16:53.419689", "step": 2218, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:53.472893", "step": 2218, "epoch": 2 }, { "type": "loss", "content": 0.19014668464660645, "timestamp": "2025-09-02 14:16:53.475362", "step": 2219, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:53.529124", "step": 2219, "epoch": 2 }, { "type": "loss", "content": 0.1338798701763153, "timestamp": "2025-09-02 14:16:53.535636", "step": 2220, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:16:55.440666", "step": 2220, "epoch": 2 }, { "type": "pplx", "content": 8541.95153235018, "timestamp": "2025-09-02 14:16:55.443011", "step": 2220, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:55.495982", "step": 2220, "epoch": 2 }, { "type": "loss", "content": 0.1677619218826294, "timestamp": "2025-09-02 14:16:55.498819", "step": 2221, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:55.552543", "step": 2221, "epoch": 2 }, { "type": "loss", "content": 0.12134654819965363, "timestamp": "2025-09-02 14:16:55.555041", "step": 2222, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:55.612939", "step": 2222, "epoch": 2 }, { "type": "loss", "content": 0.1482154130935669, "timestamp": "2025-09-02 14:16:55.615598", "step": 2223, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:55.669296", "step": 2223, "epoch": 2 }, { "type": "loss", "content": 0.4098432660102844, "timestamp": "2025-09-02 14:16:55.675882", "step": 2224, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:55.735187", "step": 2224, "epoch": 2 }, { "type": "loss", "content": 0.40486249327659607, "timestamp": "2025-09-02 14:16:55.745069", "step": 2225, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:55.803728", "step": 2225, "epoch": 2 }, { "type": "loss", "content": 0.15182463824748993, "timestamp": "2025-09-02 14:16:55.805799", "step": 2226, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:55.864186", "step": 2226, "epoch": 2 }, { "type": "loss", "content": 0.2423158586025238, "timestamp": "2025-09-02 14:16:55.867442", "step": 2227, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:55.921262", "step": 2227, "epoch": 2 }, { "type": "loss", "content": 0.2000105232000351, "timestamp": "2025-09-02 14:16:55.927708", "step": 2228, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:55.983681", "step": 2228, "epoch": 2 }, { "type": "loss", "content": 0.1322571486234665, "timestamp": "2025-09-02 14:16:55.987336", "step": 2229, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:56.041048", "step": 2229, "epoch": 2 }, { "type": "loss", "content": 0.14535550773143768, "timestamp": "2025-09-02 14:16:56.045036", "step": 2230, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:56.098403", "step": 2230, "epoch": 2 }, { "type": "loss", "content": 0.14265036582946777, "timestamp": "2025-09-02 14:16:56.100735", "step": 2231, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:56.161149", "step": 2231, "epoch": 2 }, { "type": "loss", "content": 0.09863785654306412, "timestamp": "2025-09-02 14:16:56.173765", "step": 2232, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:56.230633", "step": 2232, "epoch": 2 }, { "type": "loss", "content": 0.26660123467445374, "timestamp": "2025-09-02 14:16:56.233081", "step": 2233, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:56.286007", "step": 2233, "epoch": 2 }, { "type": "loss", "content": 0.27286460995674133, "timestamp": "2025-09-02 14:16:56.288713", "step": 2234, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:56.342313", "step": 2234, "epoch": 2 }, { "type": "loss", "content": 0.18789519369602203, "timestamp": "2025-09-02 14:16:56.344953", "step": 2235, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:56.398769", "step": 2235, "epoch": 2 }, { "type": "loss", "content": 0.2600671350955963, "timestamp": "2025-09-02 14:16:56.405599", "step": 2236, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:56.458576", "step": 2236, "epoch": 2 }, { "type": "loss", "content": 0.20354989171028137, "timestamp": "2025-09-02 14:16:56.460774", "step": 2237, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:56.513355", "step": 2237, "epoch": 2 }, { "type": "loss", "content": 0.09848403930664062, "timestamp": "2025-09-02 14:16:56.516039", "step": 2238, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:56.569579", "step": 2238, "epoch": 2 }, { "type": "loss", "content": 0.2452302724123001, "timestamp": "2025-09-02 14:16:56.572134", "step": 2239, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:56.626627", "step": 2239, "epoch": 2 }, { "type": "loss", "content": 0.25161322951316833, "timestamp": "2025-09-02 14:16:56.632578", "step": 2240, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:16:58.529196", "step": 2240, "epoch": 2 }, { "type": "pplx", "content": 8562.688750958685, "timestamp": "2025-09-02 14:16:58.531294", "step": 2240, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2240", "timestamp": "2025-09-02 14:16:58.908589", "step": 2240, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:58.965005", "step": 2240, "epoch": 2 }, { "type": "loss", "content": 0.11977759748697281, "timestamp": "2025-09-02 14:16:58.967298", "step": 2241, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:59.021445", "step": 2241, "epoch": 2 }, { "type": "loss", "content": 0.25033068656921387, "timestamp": "2025-09-02 14:16:59.023610", "step": 2242, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:59.077839", "step": 2242, "epoch": 2 }, { "type": "loss", "content": 0.25992217659950256, "timestamp": "2025-09-02 14:16:59.079992", "step": 2243, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:59.132622", "step": 2243, "epoch": 2 }, { "type": "loss", "content": 0.06055991351604462, "timestamp": "2025-09-02 14:16:59.138919", "step": 2244, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:59.191780", "step": 2244, "epoch": 2 }, { "type": "loss", "content": 0.3270051181316376, "timestamp": "2025-09-02 14:16:59.194205", "step": 2245, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:59.246984", "step": 2245, "epoch": 2 }, { "type": "loss", "content": 0.20255134999752045, "timestamp": "2025-09-02 14:16:59.249384", "step": 2246, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:59.302818", "step": 2246, "epoch": 2 }, { "type": "loss", "content": 0.2916842997074127, "timestamp": "2025-09-02 14:16:59.305266", "step": 2247, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:59.358507", "step": 2247, "epoch": 2 }, { "type": "loss", "content": 0.29163819551467896, "timestamp": "2025-09-02 14:16:59.364620", "step": 2248, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:59.417316", "step": 2248, "epoch": 2 }, { "type": "loss", "content": 0.30552753806114197, "timestamp": "2025-09-02 14:16:59.419482", "step": 2249, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:59.473841", "step": 2249, "epoch": 2 }, { "type": "loss", "content": 0.13245955109596252, "timestamp": "2025-09-02 14:16:59.476336", "step": 2250, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:59.531136", "step": 2250, "epoch": 2 }, { "type": "loss", "content": 0.17228204011917114, "timestamp": "2025-09-02 14:16:59.533983", "step": 2251, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:16:59.587084", "step": 2251, "epoch": 2 }, { "type": "loss", "content": 0.18011808395385742, "timestamp": "2025-09-02 14:16:59.593309", "step": 2252, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:16:59.648724", "step": 2252, "epoch": 2 }, { "type": "loss", "content": 0.11139411479234695, "timestamp": "2025-09-02 14:16:59.651044", "step": 2253, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:59.704071", "step": 2253, "epoch": 2 }, { "type": "loss", "content": 0.20810647308826447, "timestamp": "2025-09-02 14:16:59.707300", "step": 2254, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:59.761461", "step": 2254, "epoch": 2 }, { "type": "loss", "content": 0.22527670860290527, "timestamp": "2025-09-02 14:16:59.763748", "step": 2255, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:59.817041", "step": 2255, "epoch": 2 }, { "type": "loss", "content": 0.2208562046289444, "timestamp": "2025-09-02 14:16:59.823047", "step": 2256, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:59.875871", "step": 2256, "epoch": 2 }, { "type": "loss", "content": 0.20682057738304138, "timestamp": "2025-09-02 14:16:59.878223", "step": 2257, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:59.931673", "step": 2257, "epoch": 2 }, { "type": "loss", "content": 0.2967672049999237, "timestamp": "2025-09-02 14:16:59.933761", "step": 2258, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:16:59.987019", "step": 2258, "epoch": 2 }, { "type": "loss", "content": 0.14740459620952606, "timestamp": "2025-09-02 14:16:59.989289", "step": 2259, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:00.042396", "step": 2259, "epoch": 2 }, { "type": "loss", "content": 0.3829834759235382, "timestamp": "2025-09-02 14:17:00.048157", "step": 2260, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:17:01.952387", "step": 2260, "epoch": 2 }, { "type": "pplx", "content": 8507.258553105587, "timestamp": "2025-09-02 14:17:01.954792", "step": 2260, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:02.008262", "step": 2260, "epoch": 2 }, { "type": "loss", "content": 0.10405680537223816, "timestamp": "2025-09-02 14:17:02.010839", "step": 2261, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:02.064482", "step": 2261, "epoch": 2 }, { "type": "loss", "content": 0.10828039050102234, "timestamp": "2025-09-02 14:17:02.066907", "step": 2262, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:02.120630", "step": 2262, "epoch": 2 }, { "type": "loss", "content": 0.12791851162910461, "timestamp": "2025-09-02 14:17:02.122936", "step": 2263, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:02.175603", "step": 2263, "epoch": 2 }, { "type": "loss", "content": 0.1362743079662323, "timestamp": "2025-09-02 14:17:02.182389", "step": 2264, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:02.235594", "step": 2264, "epoch": 2 }, { "type": "loss", "content": 0.1728859692811966, "timestamp": "2025-09-02 14:17:02.238507", "step": 2265, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:02.294592", "step": 2265, "epoch": 2 }, { "type": "loss", "content": 0.15601401031017303, "timestamp": "2025-09-02 14:17:02.297025", "step": 2266, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:02.350132", "step": 2266, "epoch": 2 }, { "type": "loss", "content": 0.17831698060035706, "timestamp": "2025-09-02 14:17:02.352426", "step": 2267, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:02.405685", "step": 2267, "epoch": 2 }, { "type": "loss", "content": 0.24433037638664246, "timestamp": "2025-09-02 14:17:02.411815", "step": 2268, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:02.463959", "step": 2268, "epoch": 2 }, { "type": "loss", "content": 0.10969919711351395, "timestamp": "2025-09-02 14:17:02.466292", "step": 2269, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:02.519152", "step": 2269, "epoch": 2 }, { "type": "loss", "content": 0.14028030633926392, "timestamp": "2025-09-02 14:17:02.521360", "step": 2270, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:02.575174", "step": 2270, "epoch": 2 }, { "type": "loss", "content": 0.2091503143310547, "timestamp": "2025-09-02 14:17:02.577421", "step": 2271, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:02.630989", "step": 2271, "epoch": 2 }, { "type": "loss", "content": 0.1534203588962555, "timestamp": "2025-09-02 14:17:02.637105", "step": 2272, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:02.689779", "step": 2272, "epoch": 2 }, { "type": "loss", "content": 0.1456524282693863, "timestamp": "2025-09-02 14:17:02.692075", "step": 2273, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:02.745102", "step": 2273, "epoch": 2 }, { "type": "loss", "content": 0.13922365009784698, "timestamp": "2025-09-02 14:17:02.747597", "step": 2274, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:02.800665", "step": 2274, "epoch": 2 }, { "type": "loss", "content": 0.2163236141204834, "timestamp": "2025-09-02 14:17:02.803173", "step": 2275, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:02.856931", "step": 2275, "epoch": 2 }, { "type": "loss", "content": 0.1569770872592926, "timestamp": "2025-09-02 14:17:02.863109", "step": 2276, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:02.915660", "step": 2276, "epoch": 2 }, { "type": "loss", "content": 0.2344636172056198, "timestamp": "2025-09-02 14:17:02.917906", "step": 2277, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:02.970887", "step": 2277, "epoch": 2 }, { "type": "loss", "content": 0.15863335132598877, "timestamp": "2025-09-02 14:17:02.973051", "step": 2278, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:03.026735", "step": 2278, "epoch": 2 }, { "type": "loss", "content": 0.3004847466945648, "timestamp": "2025-09-02 14:17:03.029109", "step": 2279, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:03.082284", "step": 2279, "epoch": 2 }, { "type": "loss", "content": 0.15870308876037598, "timestamp": "2025-09-02 14:17:03.088890", "step": 2280, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:17:04.988583", "step": 2280, "epoch": 2 }, { "type": "pplx", "content": 8430.09828677965, "timestamp": "2025-09-02 14:17:04.990963", "step": 2280, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2280", "timestamp": "2025-09-02 14:17:05.394131", "step": 2280, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:05.455293", "step": 2280, "epoch": 2 }, { "type": "loss", "content": 0.1525176763534546, "timestamp": "2025-09-02 14:17:05.457642", "step": 2281, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:05.511839", "step": 2281, "epoch": 2 }, { "type": "loss", "content": 0.08821406960487366, "timestamp": "2025-09-02 14:17:05.514488", "step": 2282, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:05.568531", "step": 2282, "epoch": 2 }, { "type": "loss", "content": 0.16624167561531067, "timestamp": "2025-09-02 14:17:05.570937", "step": 2283, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:05.624887", "step": 2283, "epoch": 2 }, { "type": "loss", "content": 0.2435036450624466, "timestamp": "2025-09-02 14:17:05.631276", "step": 2284, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:05.684263", "step": 2284, "epoch": 2 }, { "type": "loss", "content": 0.24351383745670319, "timestamp": "2025-09-02 14:17:05.686948", "step": 2285, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:05.739913", "step": 2285, "epoch": 2 }, { "type": "loss", "content": 0.17122070491313934, "timestamp": "2025-09-02 14:17:05.742432", "step": 2286, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:05.795816", "step": 2286, "epoch": 2 }, { "type": "loss", "content": 0.17731046676635742, "timestamp": "2025-09-02 14:17:05.798246", "step": 2287, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:05.850926", "step": 2287, "epoch": 2 }, { "type": "loss", "content": 0.15397456288337708, "timestamp": "2025-09-02 14:17:05.857100", "step": 2288, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:05.909332", "step": 2288, "epoch": 2 }, { "type": "loss", "content": 0.203558087348938, "timestamp": "2025-09-02 14:17:05.911633", "step": 2289, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:05.965512", "step": 2289, "epoch": 2 }, { "type": "loss", "content": 0.14655278623104095, "timestamp": "2025-09-02 14:17:05.968003", "step": 2290, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:06.021262", "step": 2290, "epoch": 2 }, { "type": "loss", "content": 0.2808241546154022, "timestamp": "2025-09-02 14:17:06.023887", "step": 2291, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:06.077113", "step": 2291, "epoch": 2 }, { "type": "loss", "content": 0.21651889383792877, "timestamp": "2025-09-02 14:17:06.083085", "step": 2292, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:06.138197", "step": 2292, "epoch": 2 }, { "type": "loss", "content": 0.15340593457221985, "timestamp": "2025-09-02 14:17:06.140725", "step": 2293, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:06.193514", "step": 2293, "epoch": 2 }, { "type": "loss", "content": 0.11611995100975037, "timestamp": "2025-09-02 14:17:06.195874", "step": 2294, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:06.249185", "step": 2294, "epoch": 2 }, { "type": "loss", "content": 0.2653701901435852, "timestamp": "2025-09-02 14:17:06.251534", "step": 2295, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:06.305270", "step": 2295, "epoch": 2 }, { "type": "loss", "content": 0.19801253080368042, "timestamp": "2025-09-02 14:17:06.311262", "step": 2296, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:06.363847", "step": 2296, "epoch": 2 }, { "type": "loss", "content": 0.2747690677642822, "timestamp": "2025-09-02 14:17:06.366087", "step": 2297, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:06.419568", "step": 2297, "epoch": 2 }, { "type": "loss", "content": 0.3532094657421112, "timestamp": "2025-09-02 14:17:06.422023", "step": 2298, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:06.475219", "step": 2298, "epoch": 2 }, { "type": "loss", "content": 0.20451679825782776, "timestamp": "2025-09-02 14:17:06.477714", "step": 2299, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:06.531487", "step": 2299, "epoch": 2 }, { "type": "loss", "content": 0.19923318922519684, "timestamp": "2025-09-02 14:17:06.537396", "step": 2300, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:17:08.441376", "step": 2300, "epoch": 2 }, { "type": "pplx", "content": 8407.904130292805, "timestamp": "2025-09-02 14:17:08.443711", "step": 2300, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:08.497708", "step": 2300, "epoch": 2 }, { "type": "loss", "content": 0.13584811985492706, "timestamp": "2025-09-02 14:17:08.500094", "step": 2301, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:08.554289", "step": 2301, "epoch": 2 }, { "type": "loss", "content": 0.17185133695602417, "timestamp": "2025-09-02 14:17:08.556618", "step": 2302, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:08.609285", "step": 2302, "epoch": 2 }, { "type": "loss", "content": 0.15402783453464508, "timestamp": "2025-09-02 14:17:08.611275", "step": 2303, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:08.664506", "step": 2303, "epoch": 2 }, { "type": "loss", "content": 0.17774933576583862, "timestamp": "2025-09-02 14:17:08.671043", "step": 2304, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:08.724335", "step": 2304, "epoch": 2 }, { "type": "loss", "content": 0.12824903428554535, "timestamp": "2025-09-02 14:17:08.726546", "step": 2305, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:08.779746", "step": 2305, "epoch": 2 }, { "type": "loss", "content": 0.12333064526319504, "timestamp": "2025-09-02 14:17:08.781836", "step": 2306, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:08.835115", "step": 2306, "epoch": 2 }, { "type": "loss", "content": 0.12587034702301025, "timestamp": "2025-09-02 14:17:08.837379", "step": 2307, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:08.891291", "step": 2307, "epoch": 2 }, { "type": "loss", "content": 0.2915303707122803, "timestamp": "2025-09-02 14:17:08.897443", "step": 2308, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:08.950601", "step": 2308, "epoch": 2 }, { "type": "loss", "content": 0.26086369156837463, "timestamp": "2025-09-02 14:17:08.959278", "step": 2309, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:09.013138", "step": 2309, "epoch": 2 }, { "type": "loss", "content": 0.1758197844028473, "timestamp": "2025-09-02 14:17:09.015614", "step": 2310, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:09.068601", "step": 2310, "epoch": 2 }, { "type": "loss", "content": 0.1749664545059204, "timestamp": "2025-09-02 14:17:09.070824", "step": 2311, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:09.123946", "step": 2311, "epoch": 2 }, { "type": "loss", "content": 0.1483246088027954, "timestamp": "2025-09-02 14:17:09.130240", "step": 2312, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:09.182568", "step": 2312, "epoch": 2 }, { "type": "loss", "content": 0.15702788531780243, "timestamp": "2025-09-02 14:17:09.185193", "step": 2313, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:09.238310", "step": 2313, "epoch": 2 }, { "type": "loss", "content": 0.09373130649328232, "timestamp": "2025-09-02 14:17:09.240619", "step": 2314, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:09.294351", "step": 2314, "epoch": 2 }, { "type": "loss", "content": 0.15963074564933777, "timestamp": "2025-09-02 14:17:09.296719", "step": 2315, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:09.350146", "step": 2315, "epoch": 2 }, { "type": "loss", "content": 0.11424769461154938, "timestamp": "2025-09-02 14:17:09.356414", "step": 2316, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:09.409583", "step": 2316, "epoch": 2 }, { "type": "loss", "content": 0.11938527971506119, "timestamp": "2025-09-02 14:17:09.411543", "step": 2317, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:09.465064", "step": 2317, "epoch": 2 }, { "type": "loss", "content": 0.19827508926391602, "timestamp": "2025-09-02 14:17:09.469199", "step": 2318, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:09.522673", "step": 2318, "epoch": 2 }, { "type": "loss", "content": 0.21077202260494232, "timestamp": "2025-09-02 14:17:09.524943", "step": 2319, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:09.578878", "step": 2319, "epoch": 2 }, { "type": "loss", "content": 0.15479598939418793, "timestamp": "2025-09-02 14:17:09.584920", "step": 2320, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:17:11.476721", "step": 2320, "epoch": 2 }, { "type": "pplx", "content": 8458.56577410276, "timestamp": "2025-09-02 14:17:11.479176", "step": 2320, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2320", "timestamp": "2025-09-02 14:17:11.888349", "step": 2320, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:11.943861", "step": 2320, "epoch": 2 }, { "type": "loss", "content": 0.2130316197872162, "timestamp": "2025-09-02 14:17:11.946025", "step": 2321, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:12.000752", "step": 2321, "epoch": 2 }, { "type": "loss", "content": 0.2478100210428238, "timestamp": "2025-09-02 14:17:12.003293", "step": 2322, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:12.058024", "step": 2322, "epoch": 2 }, { "type": "loss", "content": 0.22739757597446442, "timestamp": "2025-09-02 14:17:12.061236", "step": 2323, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:12.115718", "step": 2323, "epoch": 2 }, { "type": "loss", "content": 0.15324051678180695, "timestamp": "2025-09-02 14:17:12.122021", "step": 2324, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:12.174842", "step": 2324, "epoch": 2 }, { "type": "loss", "content": 0.18408344686031342, "timestamp": "2025-09-02 14:17:12.177216", "step": 2325, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:12.230927", "step": 2325, "epoch": 2 }, { "type": "loss", "content": 0.2398451417684555, "timestamp": "2025-09-02 14:17:12.233397", "step": 2326, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:12.286798", "step": 2326, "epoch": 2 }, { "type": "loss", "content": 0.14946196973323822, "timestamp": "2025-09-02 14:17:12.289048", "step": 2327, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:12.342122", "step": 2327, "epoch": 2 }, { "type": "loss", "content": 0.15743812918663025, "timestamp": "2025-09-02 14:17:12.347948", "step": 2328, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:12.401739", "step": 2328, "epoch": 2 }, { "type": "loss", "content": 0.1959870308637619, "timestamp": "2025-09-02 14:17:12.404308", "step": 2329, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:12.457834", "step": 2329, "epoch": 2 }, { "type": "loss", "content": 0.22273696959018707, "timestamp": "2025-09-02 14:17:12.459900", "step": 2330, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:12.513351", "step": 2330, "epoch": 2 }, { "type": "loss", "content": 0.11160798370838165, "timestamp": "2025-09-02 14:17:12.515851", "step": 2331, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:12.570082", "step": 2331, "epoch": 2 }, { "type": "loss", "content": 0.1782943606376648, "timestamp": "2025-09-02 14:17:12.576130", "step": 2332, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:12.629069", "step": 2332, "epoch": 2 }, { "type": "loss", "content": 0.20467209815979004, "timestamp": "2025-09-02 14:17:12.631275", "step": 2333, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:12.684636", "step": 2333, "epoch": 2 }, { "type": "loss", "content": 0.13326402008533478, "timestamp": "2025-09-02 14:17:12.687007", "step": 2334, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:12.740564", "step": 2334, "epoch": 2 }, { "type": "loss", "content": 0.148786723613739, "timestamp": "2025-09-02 14:17:12.742903", "step": 2335, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:12.796221", "step": 2335, "epoch": 2 }, { "type": "loss", "content": 0.17425136268138885, "timestamp": "2025-09-02 14:17:12.802367", "step": 2336, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:12.855312", "step": 2336, "epoch": 2 }, { "type": "loss", "content": 0.16079550981521606, "timestamp": "2025-09-02 14:17:12.857878", "step": 2337, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:12.911121", "step": 2337, "epoch": 2 }, { "type": "loss", "content": 0.12640443444252014, "timestamp": "2025-09-02 14:17:12.914072", "step": 2338, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:12.968354", "step": 2338, "epoch": 2 }, { "type": "loss", "content": 0.2278200089931488, "timestamp": "2025-09-02 14:17:12.972781", "step": 2339, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:13.027449", "step": 2339, "epoch": 2 }, { "type": "loss", "content": 0.0964229479432106, "timestamp": "2025-09-02 14:17:13.033419", "step": 2340, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:17:14.940442", "step": 2340, "epoch": 2 }, { "type": "pplx", "content": 8736.807395611519, "timestamp": "2025-09-02 14:17:14.942733", "step": 2340, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:14.996031", "step": 2340, "epoch": 2 }, { "type": "loss", "content": 0.25752386450767517, "timestamp": "2025-09-02 14:17:14.998424", "step": 2341, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:15.053026", "step": 2341, "epoch": 2 }, { "type": "loss", "content": 0.2896779179573059, "timestamp": "2025-09-02 14:17:15.055572", "step": 2342, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:15.109063", "step": 2342, "epoch": 2 }, { "type": "loss", "content": 0.15423355996608734, "timestamp": "2025-09-02 14:17:15.111363", "step": 2343, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:15.163958", "step": 2343, "epoch": 2 }, { "type": "loss", "content": 0.17839787900447845, "timestamp": "2025-09-02 14:17:15.173051", "step": 2344, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:15.227274", "step": 2344, "epoch": 2 }, { "type": "loss", "content": 0.2809090316295624, "timestamp": "2025-09-02 14:17:15.229274", "step": 2345, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:15.281237", "step": 2345, "epoch": 2 }, { "type": "loss", "content": 0.19912008941173553, "timestamp": "2025-09-02 14:17:15.283463", "step": 2346, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:15.336191", "step": 2346, "epoch": 2 }, { "type": "loss", "content": 0.11119423806667328, "timestamp": "2025-09-02 14:17:15.338366", "step": 2347, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:15.391622", "step": 2347, "epoch": 2 }, { "type": "loss", "content": 0.18170210719108582, "timestamp": "2025-09-02 14:17:15.397829", "step": 2348, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:15.450406", "step": 2348, "epoch": 2 }, { "type": "loss", "content": 0.1406015306711197, "timestamp": "2025-09-02 14:17:15.452681", "step": 2349, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:15.505090", "step": 2349, "epoch": 2 }, { "type": "loss", "content": 0.1603090763092041, "timestamp": "2025-09-02 14:17:15.507192", "step": 2350, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:15.561172", "step": 2350, "epoch": 2 }, { "type": "loss", "content": 0.2045137882232666, "timestamp": "2025-09-02 14:17:15.563012", "step": 2351, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:15.615184", "step": 2351, "epoch": 2 }, { "type": "loss", "content": 0.1899438202381134, "timestamp": "2025-09-02 14:17:15.620704", "step": 2352, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:15.672853", "step": 2352, "epoch": 2 }, { "type": "loss", "content": 0.14841808378696442, "timestamp": "2025-09-02 14:17:15.675319", "step": 2353, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:15.729085", "step": 2353, "epoch": 2 }, { "type": "loss", "content": 0.30981144309043884, "timestamp": "2025-09-02 14:17:15.731439", "step": 2354, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:15.784161", "step": 2354, "epoch": 2 }, { "type": "loss", "content": 0.24735352396965027, "timestamp": "2025-09-02 14:17:15.786487", "step": 2355, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:15.840031", "step": 2355, "epoch": 2 }, { "type": "loss", "content": 0.26461711525917053, "timestamp": "2025-09-02 14:17:15.845914", "step": 2356, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:15.898350", "step": 2356, "epoch": 2 }, { "type": "loss", "content": 0.2387523204088211, "timestamp": "2025-09-02 14:17:15.900538", "step": 2357, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:15.953811", "step": 2357, "epoch": 2 }, { "type": "loss", "content": 0.22780044376850128, "timestamp": "2025-09-02 14:17:15.956013", "step": 2358, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:16.010906", "step": 2358, "epoch": 2 }, { "type": "loss", "content": 0.15855996310710907, "timestamp": "2025-09-02 14:17:16.012826", "step": 2359, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:16.067104", "step": 2359, "epoch": 2 }, { "type": "loss", "content": 0.07899369299411774, "timestamp": "2025-09-02 14:17:16.072929", "step": 2360, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:17:17.937727", "step": 2360, "epoch": 2 }, { "type": "pplx", "content": 8880.884948420971, "timestamp": "2025-09-02 14:17:17.939754", "step": 2360, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2360", "timestamp": "2025-09-02 14:17:18.302844", "step": 2360, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:18.359276", "step": 2360, "epoch": 2 }, { "type": "loss", "content": 0.12755480408668518, "timestamp": "2025-09-02 14:17:18.361225", "step": 2361, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:18.423543", "step": 2361, "epoch": 2 }, { "type": "loss", "content": 0.3675791323184967, "timestamp": "2025-09-02 14:17:18.425734", "step": 2362, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:18.478868", "step": 2362, "epoch": 2 }, { "type": "loss", "content": 0.19471004605293274, "timestamp": "2025-09-02 14:17:18.481203", "step": 2363, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:18.534048", "step": 2363, "epoch": 2 }, { "type": "loss", "content": 0.2819225788116455, "timestamp": "2025-09-02 14:17:18.540243", "step": 2364, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:18.594159", "step": 2364, "epoch": 2 }, { "type": "loss", "content": 0.1585155874490738, "timestamp": "2025-09-02 14:17:18.596488", "step": 2365, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:18.649898", "step": 2365, "epoch": 2 }, { "type": "loss", "content": 0.18467028439044952, "timestamp": "2025-09-02 14:17:18.652108", "step": 2366, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:18.705188", "step": 2366, "epoch": 2 }, { "type": "loss", "content": 0.20873305201530457, "timestamp": "2025-09-02 14:17:18.707307", "step": 2367, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:18.759899", "step": 2367, "epoch": 2 }, { "type": "loss", "content": 0.1464056819677353, "timestamp": "2025-09-02 14:17:18.770236", "step": 2368, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:18.823833", "step": 2368, "epoch": 2 }, { "type": "loss", "content": 0.22652466595172882, "timestamp": "2025-09-02 14:17:18.827182", "step": 2369, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:18.887657", "step": 2369, "epoch": 2 }, { "type": "loss", "content": 0.28920507431030273, "timestamp": "2025-09-02 14:17:18.890174", "step": 2370, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:18.953813", "step": 2370, "epoch": 2 }, { "type": "loss", "content": 0.09898601472377777, "timestamp": "2025-09-02 14:17:18.955621", "step": 2371, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:19.014021", "step": 2371, "epoch": 2 }, { "type": "loss", "content": 0.09722872823476791, "timestamp": "2025-09-02 14:17:19.019928", "step": 2372, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:19.093297", "step": 2372, "epoch": 2 }, { "type": "loss", "content": 0.2919410765171051, "timestamp": "2025-09-02 14:17:19.095736", "step": 2373, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:19.160252", "step": 2373, "epoch": 2 }, { "type": "loss", "content": 0.2559782862663269, "timestamp": "2025-09-02 14:17:19.162705", "step": 2374, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:19.220577", "step": 2374, "epoch": 2 }, { "type": "loss", "content": 0.29243290424346924, "timestamp": "2025-09-02 14:17:19.223003", "step": 2375, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:19.280484", "step": 2375, "epoch": 2 }, { "type": "loss", "content": 0.08171891421079636, "timestamp": "2025-09-02 14:17:19.286343", "step": 2376, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:19.344612", "step": 2376, "epoch": 2 }, { "type": "loss", "content": 0.17648068070411682, "timestamp": "2025-09-02 14:17:19.346927", "step": 2377, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:19.406335", "step": 2377, "epoch": 2 }, { "type": "loss", "content": 0.17943650484085083, "timestamp": "2025-09-02 14:17:19.408885", "step": 2378, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:19.469796", "step": 2378, "epoch": 2 }, { "type": "loss", "content": 0.260780394077301, "timestamp": "2025-09-02 14:17:19.472315", "step": 2379, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:19.536607", "step": 2379, "epoch": 2 }, { "type": "loss", "content": 0.14453189074993134, "timestamp": "2025-09-02 14:17:19.545592", "step": 2380, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:17:22.412054", "step": 2380, "epoch": 2 }, { "type": "pplx", "content": 8704.774357348115, "timestamp": "2025-09-02 14:17:22.415039", "step": 2380, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:22.475787", "step": 2380, "epoch": 2 }, { "type": "loss", "content": 0.2140973061323166, "timestamp": "2025-09-02 14:17:22.478077", "step": 2381, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:22.535238", "step": 2381, "epoch": 2 }, { "type": "loss", "content": 0.15710480511188507, "timestamp": "2025-09-02 14:17:22.537517", "step": 2382, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:22.596083", "step": 2382, "epoch": 2 }, { "type": "loss", "content": 0.2108018696308136, "timestamp": "2025-09-02 14:17:22.598507", "step": 2383, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:22.652087", "step": 2383, "epoch": 2 }, { "type": "loss", "content": 0.14824844896793365, "timestamp": "2025-09-02 14:17:22.658256", "step": 2384, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:22.710690", "step": 2384, "epoch": 2 }, { "type": "loss", "content": 0.20933058857917786, "timestamp": "2025-09-02 14:17:22.713188", "step": 2385, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:22.775594", "step": 2385, "epoch": 2 }, { "type": "loss", "content": 0.15816497802734375, "timestamp": "2025-09-02 14:17:22.778023", "step": 2386, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:22.834117", "step": 2386, "epoch": 2 }, { "type": "loss", "content": 0.07622687518596649, "timestamp": "2025-09-02 14:17:22.836707", "step": 2387, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:22.894225", "step": 2387, "epoch": 2 }, { "type": "loss", "content": 0.21483145654201508, "timestamp": "2025-09-02 14:17:22.901485", "step": 2388, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:22.954883", "step": 2388, "epoch": 2 }, { "type": "loss", "content": 0.18163180351257324, "timestamp": "2025-09-02 14:17:22.957167", "step": 2389, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:23.014784", "step": 2389, "epoch": 2 }, { "type": "loss", "content": 0.0725657120347023, "timestamp": "2025-09-02 14:17:23.017839", "step": 2390, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:23.072042", "step": 2390, "epoch": 2 }, { "type": "loss", "content": 0.22455555200576782, "timestamp": "2025-09-02 14:17:23.074369", "step": 2391, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:23.129056", "step": 2391, "epoch": 2 }, { "type": "loss", "content": 0.10720934718847275, "timestamp": "2025-09-02 14:17:23.140545", "step": 2392, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:23.193087", "step": 2392, "epoch": 2 }, { "type": "loss", "content": 0.17479492723941803, "timestamp": "2025-09-02 14:17:23.195480", "step": 2393, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:23.248662", "step": 2393, "epoch": 2 }, { "type": "loss", "content": 0.2110992968082428, "timestamp": "2025-09-02 14:17:23.251086", "step": 2394, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:23.304006", "step": 2394, "epoch": 2 }, { "type": "loss", "content": 0.13071197271347046, "timestamp": "2025-09-02 14:17:23.308190", "step": 2395, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:23.363781", "step": 2395, "epoch": 2 }, { "type": "loss", "content": 0.27646875381469727, "timestamp": "2025-09-02 14:17:23.369642", "step": 2396, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:23.427295", "step": 2396, "epoch": 2 }, { "type": "loss", "content": 0.07264316082000732, "timestamp": "2025-09-02 14:17:23.429607", "step": 2397, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:23.490487", "step": 2397, "epoch": 2 }, { "type": "loss", "content": 0.08206354826688766, "timestamp": "2025-09-02 14:17:23.492914", "step": 2398, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:23.546783", "step": 2398, "epoch": 2 }, { "type": "loss", "content": 0.16342388093471527, "timestamp": "2025-09-02 14:17:23.551400", "step": 2399, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:23.605736", "step": 2399, "epoch": 2 }, { "type": "loss", "content": 0.21103817224502563, "timestamp": "2025-09-02 14:17:23.614086", "step": 2400, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:17:25.547119", "step": 2400, "epoch": 2 }, { "type": "pplx", "content": 8678.967127770118, "timestamp": "2025-09-02 14:17:25.549347", "step": 2400, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2400", "timestamp": "2025-09-02 14:17:25.908185", "step": 2400, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:25.964874", "step": 2400, "epoch": 2 }, { "type": "loss", "content": 0.12474822998046875, "timestamp": "2025-09-02 14:17:25.967240", "step": 2401, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:26.021943", "step": 2401, "epoch": 2 }, { "type": "loss", "content": 0.16031667590141296, "timestamp": "2025-09-02 14:17:26.024238", "step": 2402, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:26.078184", "step": 2402, "epoch": 2 }, { "type": "loss", "content": 0.10614752769470215, "timestamp": "2025-09-02 14:17:26.080493", "step": 2403, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:26.134285", "step": 2403, "epoch": 2 }, { "type": "loss", "content": 0.11765092611312866, "timestamp": "2025-09-02 14:17:26.141777", "step": 2404, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:26.194165", "step": 2404, "epoch": 2 }, { "type": "loss", "content": 0.19299103319644928, "timestamp": "2025-09-02 14:17:26.196603", "step": 2405, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:26.250221", "step": 2405, "epoch": 2 }, { "type": "loss", "content": 0.12314549833536148, "timestamp": "2025-09-02 14:17:26.252376", "step": 2406, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:26.305592", "step": 2406, "epoch": 2 }, { "type": "loss", "content": 0.26220810413360596, "timestamp": "2025-09-02 14:17:26.308026", "step": 2407, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:26.361667", "step": 2407, "epoch": 2 }, { "type": "loss", "content": 0.14083737134933472, "timestamp": "2025-09-02 14:17:26.368046", "step": 2408, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:26.421123", "step": 2408, "epoch": 2 }, { "type": "loss", "content": 0.22596865892410278, "timestamp": "2025-09-02 14:17:26.423520", "step": 2409, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:26.477507", "step": 2409, "epoch": 2 }, { "type": "loss", "content": 0.1685769259929657, "timestamp": "2025-09-02 14:17:26.479835", "step": 2410, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:26.533309", "step": 2410, "epoch": 2 }, { "type": "loss", "content": 0.2653995752334595, "timestamp": "2025-09-02 14:17:26.536419", "step": 2411, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:26.590252", "step": 2411, "epoch": 2 }, { "type": "loss", "content": 0.10325127840042114, "timestamp": "2025-09-02 14:17:26.596303", "step": 2412, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:26.649210", "step": 2412, "epoch": 2 }, { "type": "loss", "content": 0.34099340438842773, "timestamp": "2025-09-02 14:17:26.651599", "step": 2413, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:26.706599", "step": 2413, "epoch": 2 }, { "type": "loss", "content": 0.2578793168067932, "timestamp": "2025-09-02 14:17:26.709088", "step": 2414, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:26.762029", "step": 2414, "epoch": 2 }, { "type": "loss", "content": 0.16987788677215576, "timestamp": "2025-09-02 14:17:26.764403", "step": 2415, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:26.818811", "step": 2415, "epoch": 2 }, { "type": "loss", "content": 0.3523900806903839, "timestamp": "2025-09-02 14:17:26.824909", "step": 2416, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:26.878937", "step": 2416, "epoch": 2 }, { "type": "loss", "content": 0.14697273075580597, "timestamp": "2025-09-02 14:17:26.881500", "step": 2417, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:26.934728", "step": 2417, "epoch": 2 }, { "type": "loss", "content": 0.17800146341323853, "timestamp": "2025-09-02 14:17:26.937280", "step": 2418, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:26.990858", "step": 2418, "epoch": 2 }, { "type": "loss", "content": 0.170994833111763, "timestamp": "2025-09-02 14:17:26.993516", "step": 2419, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:27.046770", "step": 2419, "epoch": 2 }, { "type": "loss", "content": 0.11230450123548508, "timestamp": "2025-09-02 14:17:27.053821", "step": 2420, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:17:28.984147", "step": 2420, "epoch": 2 }, { "type": "pplx", "content": 8928.287495780827, "timestamp": "2025-09-02 14:17:28.987291", "step": 2420, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:29.041409", "step": 2420, "epoch": 2 }, { "type": "loss", "content": 0.14101973176002502, "timestamp": "2025-09-02 14:17:29.043745", "step": 2421, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:29.097482", "step": 2421, "epoch": 2 }, { "type": "loss", "content": 0.11957092583179474, "timestamp": "2025-09-02 14:17:29.101802", "step": 2422, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:29.170374", "step": 2422, "epoch": 2 }, { "type": "loss", "content": 0.26123547554016113, "timestamp": "2025-09-02 14:17:29.173896", "step": 2423, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:29.228774", "step": 2423, "epoch": 2 }, { "type": "loss", "content": 0.20374584197998047, "timestamp": "2025-09-02 14:17:29.234905", "step": 2424, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:29.287697", "step": 2424, "epoch": 2 }, { "type": "loss", "content": 0.1821206510066986, "timestamp": "2025-09-02 14:17:29.290110", "step": 2425, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:29.343794", "step": 2425, "epoch": 2 }, { "type": "loss", "content": 0.23572666943073273, "timestamp": "2025-09-02 14:17:29.346845", "step": 2426, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:29.400703", "step": 2426, "epoch": 2 }, { "type": "loss", "content": 0.21820105612277985, "timestamp": "2025-09-02 14:17:29.403416", "step": 2427, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:29.457481", "step": 2427, "epoch": 2 }, { "type": "loss", "content": 0.12350670248270035, "timestamp": "2025-09-02 14:17:29.463618", "step": 2428, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:29.520212", "step": 2428, "epoch": 2 }, { "type": "loss", "content": 0.1861865222454071, "timestamp": "2025-09-02 14:17:29.522742", "step": 2429, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:29.580851", "step": 2429, "epoch": 2 }, { "type": "loss", "content": 0.20498672127723694, "timestamp": "2025-09-02 14:17:29.583594", "step": 2430, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:29.642429", "step": 2430, "epoch": 2 }, { "type": "loss", "content": 0.10519459843635559, "timestamp": "2025-09-02 14:17:29.644723", "step": 2431, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:29.704448", "step": 2431, "epoch": 2 }, { "type": "loss", "content": 0.11284515261650085, "timestamp": "2025-09-02 14:17:29.710430", "step": 2432, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:29.763771", "step": 2432, "epoch": 2 }, { "type": "loss", "content": 0.04941633716225624, "timestamp": "2025-09-02 14:17:29.766229", "step": 2433, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:29.820937", "step": 2433, "epoch": 2 }, { "type": "loss", "content": 0.04663838446140289, "timestamp": "2025-09-02 14:17:29.823683", "step": 2434, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:29.878112", "step": 2434, "epoch": 2 }, { "type": "loss", "content": 0.2952808737754822, "timestamp": "2025-09-02 14:17:29.880387", "step": 2435, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:29.933843", "step": 2435, "epoch": 2 }, { "type": "loss", "content": 0.19903601706027985, "timestamp": "2025-09-02 14:17:29.939782", "step": 2436, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:29.991896", "step": 2436, "epoch": 2 }, { "type": "loss", "content": 0.09198466688394547, "timestamp": "2025-09-02 14:17:29.994152", "step": 2437, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:30.047498", "step": 2437, "epoch": 2 }, { "type": "loss", "content": 0.12378840893507004, "timestamp": "2025-09-02 14:17:30.049804", "step": 2438, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:30.102750", "step": 2438, "epoch": 2 }, { "type": "loss", "content": 0.1511690616607666, "timestamp": "2025-09-02 14:17:30.105125", "step": 2439, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:30.158552", "step": 2439, "epoch": 2 }, { "type": "loss", "content": 0.18689627945423126, "timestamp": "2025-09-02 14:17:30.164671", "step": 2440, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:17:32.073547", "step": 2440, "epoch": 2 }, { "type": "pplx", "content": 9070.983756882382, "timestamp": "2025-09-02 14:17:32.079447", "step": 2440, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2440", "timestamp": "2025-09-02 14:17:32.487289", "step": 2440, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:32.544138", "step": 2440, "epoch": 2 }, { "type": "loss", "content": 0.17916624248027802, "timestamp": "2025-09-02 14:17:32.546190", "step": 2441, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:32.599642", "step": 2441, "epoch": 2 }, { "type": "loss", "content": 0.12317846715450287, "timestamp": "2025-09-02 14:17:32.601979", "step": 2442, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:32.654586", "step": 2442, "epoch": 2 }, { "type": "loss", "content": 0.30071553587913513, "timestamp": "2025-09-02 14:17:32.657124", "step": 2443, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:32.711682", "step": 2443, "epoch": 2 }, { "type": "loss", "content": 0.07703974097967148, "timestamp": "2025-09-02 14:17:32.719141", "step": 2444, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:32.773089", "step": 2444, "epoch": 2 }, { "type": "loss", "content": 0.22215890884399414, "timestamp": "2025-09-02 14:17:32.775472", "step": 2445, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:32.828811", "step": 2445, "epoch": 2 }, { "type": "loss", "content": 0.20542486011981964, "timestamp": "2025-09-02 14:17:32.831019", "step": 2446, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:32.883791", "step": 2446, "epoch": 2 }, { "type": "loss", "content": 0.16678959131240845, "timestamp": "2025-09-02 14:17:32.886422", "step": 2447, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:32.939410", "step": 2447, "epoch": 2 }, { "type": "loss", "content": 0.18733811378479004, "timestamp": "2025-09-02 14:17:32.945950", "step": 2448, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:32.999449", "step": 2448, "epoch": 2 }, { "type": "loss", "content": 0.2318241149187088, "timestamp": "2025-09-02 14:17:33.002002", "step": 2449, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:33.055003", "step": 2449, "epoch": 2 }, { "type": "loss", "content": 0.19476893544197083, "timestamp": "2025-09-02 14:17:33.057403", "step": 2450, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:33.110062", "step": 2450, "epoch": 2 }, { "type": "loss", "content": 0.17377187311649323, "timestamp": "2025-09-02 14:17:33.113056", "step": 2451, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:33.166450", "step": 2451, "epoch": 2 }, { "type": "loss", "content": 0.06166215240955353, "timestamp": "2025-09-02 14:17:33.172735", "step": 2452, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:33.225332", "step": 2452, "epoch": 2 }, { "type": "loss", "content": 0.11999202519655228, "timestamp": "2025-09-02 14:17:33.228320", "step": 2453, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:33.281380", "step": 2453, "epoch": 2 }, { "type": "loss", "content": 0.1715960055589676, "timestamp": "2025-09-02 14:17:33.283700", "step": 2454, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:33.337163", "step": 2454, "epoch": 2 }, { "type": "loss", "content": 0.20783591270446777, "timestamp": "2025-09-02 14:17:33.339645", "step": 2455, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:33.394531", "step": 2455, "epoch": 2 }, { "type": "loss", "content": 0.1844734251499176, "timestamp": "2025-09-02 14:17:33.400942", "step": 2456, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:33.453196", "step": 2456, "epoch": 2 }, { "type": "loss", "content": 0.19090311229228973, "timestamp": "2025-09-02 14:17:33.456062", "step": 2457, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:33.508821", "step": 2457, "epoch": 2 }, { "type": "loss", "content": 0.11114278435707092, "timestamp": "2025-09-02 14:17:33.511155", "step": 2458, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:33.564795", "step": 2458, "epoch": 2 }, { "type": "loss", "content": 0.14566931128501892, "timestamp": "2025-09-02 14:17:33.567347", "step": 2459, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:33.620783", "step": 2459, "epoch": 2 }, { "type": "loss", "content": 0.3144551217556, "timestamp": "2025-09-02 14:17:33.626721", "step": 2460, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:17:35.519755", "step": 2460, "epoch": 2 }, { "type": "pplx", "content": 9243.570666469339, "timestamp": "2025-09-02 14:17:35.522128", "step": 2460, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:35.575301", "step": 2460, "epoch": 2 }, { "type": "loss", "content": 0.43669435381889343, "timestamp": "2025-09-02 14:17:35.577853", "step": 2461, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:35.632860", "step": 2461, "epoch": 2 }, { "type": "loss", "content": 0.1267927587032318, "timestamp": "2025-09-02 14:17:35.634865", "step": 2462, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:35.688023", "step": 2462, "epoch": 2 }, { "type": "loss", "content": 0.15399718284606934, "timestamp": "2025-09-02 14:17:35.690410", "step": 2463, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:35.743846", "step": 2463, "epoch": 2 }, { "type": "loss", "content": 0.2806377410888672, "timestamp": "2025-09-02 14:17:35.750106", "step": 2464, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:35.802220", "step": 2464, "epoch": 2 }, { "type": "loss", "content": 0.1577605903148651, "timestamp": "2025-09-02 14:17:35.804529", "step": 2465, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:35.857812", "step": 2465, "epoch": 2 }, { "type": "loss", "content": 0.12262207269668579, "timestamp": "2025-09-02 14:17:35.861229", "step": 2466, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:35.914524", "step": 2466, "epoch": 2 }, { "type": "loss", "content": 0.18381008505821228, "timestamp": "2025-09-02 14:17:35.916871", "step": 2467, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:35.969966", "step": 2467, "epoch": 2 }, { "type": "loss", "content": 0.41190746426582336, "timestamp": "2025-09-02 14:17:35.976155", "step": 2468, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:36.028658", "step": 2468, "epoch": 2 }, { "type": "loss", "content": 0.18128719925880432, "timestamp": "2025-09-02 14:17:36.030715", "step": 2469, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:36.084173", "step": 2469, "epoch": 2 }, { "type": "loss", "content": 0.1504160612821579, "timestamp": "2025-09-02 14:17:36.086716", "step": 2470, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:36.141390", "step": 2470, "epoch": 2 }, { "type": "loss", "content": 0.1775505542755127, "timestamp": "2025-09-02 14:17:36.143635", "step": 2471, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:36.197089", "step": 2471, "epoch": 2 }, { "type": "loss", "content": 0.16476285457611084, "timestamp": "2025-09-02 14:17:36.203116", "step": 2472, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:36.256157", "step": 2472, "epoch": 2 }, { "type": "loss", "content": 0.18175794184207916, "timestamp": "2025-09-02 14:17:36.258605", "step": 2473, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:36.311939", "step": 2473, "epoch": 2 }, { "type": "loss", "content": 0.15479369461536407, "timestamp": "2025-09-02 14:17:36.314507", "step": 2474, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:36.369093", "step": 2474, "epoch": 2 }, { "type": "loss", "content": 0.22306549549102783, "timestamp": "2025-09-02 14:17:36.371639", "step": 2475, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:36.424951", "step": 2475, "epoch": 2 }, { "type": "loss", "content": 0.3903406262397766, "timestamp": "2025-09-02 14:17:36.430981", "step": 2476, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:36.485942", "step": 2476, "epoch": 2 }, { "type": "loss", "content": 0.0852648988366127, "timestamp": "2025-09-02 14:17:36.488506", "step": 2477, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:36.542737", "step": 2477, "epoch": 2 }, { "type": "loss", "content": 0.11986123770475388, "timestamp": "2025-09-02 14:17:36.545434", "step": 2478, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:36.599857", "step": 2478, "epoch": 2 }, { "type": "loss", "content": 0.18646681308746338, "timestamp": "2025-09-02 14:17:36.602160", "step": 2479, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:36.655941", "step": 2479, "epoch": 2 }, { "type": "loss", "content": 0.2221401035785675, "timestamp": "2025-09-02 14:17:36.662196", "step": 2480, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:17:38.572247", "step": 2480, "epoch": 2 }, { "type": "pplx", "content": 9052.729081723372, "timestamp": "2025-09-02 14:17:38.574585", "step": 2480, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2480", "timestamp": "2025-09-02 14:17:38.964786", "step": 2480, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:39.022755", "step": 2480, "epoch": 2 }, { "type": "loss", "content": 0.37989774346351624, "timestamp": "2025-09-02 14:17:39.025115", "step": 2481, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:39.081022", "step": 2481, "epoch": 2 }, { "type": "loss", "content": 0.25237858295440674, "timestamp": "2025-09-02 14:17:39.083230", "step": 2482, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:39.136935", "step": 2482, "epoch": 2 }, { "type": "loss", "content": 0.24238801002502441, "timestamp": "2025-09-02 14:17:39.139197", "step": 2483, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:39.192656", "step": 2483, "epoch": 2 }, { "type": "loss", "content": 0.08683894574642181, "timestamp": "2025-09-02 14:17:39.198715", "step": 2484, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:39.251318", "step": 2484, "epoch": 2 }, { "type": "loss", "content": 0.15137594938278198, "timestamp": "2025-09-02 14:17:39.253815", "step": 2485, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:39.306551", "step": 2485, "epoch": 2 }, { "type": "loss", "content": 0.12206298112869263, "timestamp": "2025-09-02 14:17:39.308700", "step": 2486, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:39.362149", "step": 2486, "epoch": 2 }, { "type": "loss", "content": 0.11206850409507751, "timestamp": "2025-09-02 14:17:39.364371", "step": 2487, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:39.417708", "step": 2487, "epoch": 2 }, { "type": "loss", "content": 0.20927323400974274, "timestamp": "2025-09-02 14:17:39.423848", "step": 2488, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:39.477727", "step": 2488, "epoch": 2 }, { "type": "loss", "content": 0.130281463265419, "timestamp": "2025-09-02 14:17:39.479990", "step": 2489, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:39.536711", "step": 2489, "epoch": 2 }, { "type": "loss", "content": 0.26080623269081116, "timestamp": "2025-09-02 14:17:39.539157", "step": 2490, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:39.601116", "step": 2490, "epoch": 2 }, { "type": "loss", "content": 0.13588303327560425, "timestamp": "2025-09-02 14:17:39.604125", "step": 2491, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:39.661854", "step": 2491, "epoch": 2 }, { "type": "loss", "content": 0.2120768278837204, "timestamp": "2025-09-02 14:17:39.667718", "step": 2492, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:39.720212", "step": 2492, "epoch": 2 }, { "type": "loss", "content": 0.19371789693832397, "timestamp": "2025-09-02 14:17:39.722512", "step": 2493, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:39.775618", "step": 2493, "epoch": 2 }, { "type": "loss", "content": 0.16400423645973206, "timestamp": "2025-09-02 14:17:39.777914", "step": 2494, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:39.830871", "step": 2494, "epoch": 2 }, { "type": "loss", "content": 0.17120707035064697, "timestamp": "2025-09-02 14:17:39.833454", "step": 2495, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:39.888626", "step": 2495, "epoch": 2 }, { "type": "loss", "content": 0.1921118199825287, "timestamp": "2025-09-02 14:17:39.894514", "step": 2496, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:39.947141", "step": 2496, "epoch": 2 }, { "type": "loss", "content": 0.1304924488067627, "timestamp": "2025-09-02 14:17:39.949370", "step": 2497, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:40.004204", "step": 2497, "epoch": 2 }, { "type": "loss", "content": 0.05452202633023262, "timestamp": "2025-09-02 14:17:40.006522", "step": 2498, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:40.060326", "step": 2498, "epoch": 2 }, { "type": "loss", "content": 0.13452935218811035, "timestamp": "2025-09-02 14:17:40.062486", "step": 2499, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:40.117786", "step": 2499, "epoch": 2 }, { "type": "loss", "content": 0.4003707766532898, "timestamp": "2025-09-02 14:17:40.123563", "step": 2500, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:17:42.022187", "step": 2500, "epoch": 2 }, { "type": "pplx", "content": 8632.292879791197, "timestamp": "2025-09-02 14:17:42.024384", "step": 2500, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:42.077823", "step": 2500, "epoch": 2 }, { "type": "loss", "content": 0.14044775068759918, "timestamp": "2025-09-02 14:17:42.080234", "step": 2501, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:42.133947", "step": 2501, "epoch": 2 }, { "type": "loss", "content": 0.17990706861019135, "timestamp": "2025-09-02 14:17:42.136379", "step": 2502, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:42.189938", "step": 2502, "epoch": 2 }, { "type": "loss", "content": 0.12736894190311432, "timestamp": "2025-09-02 14:17:42.192722", "step": 2503, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:42.247042", "step": 2503, "epoch": 2 }, { "type": "loss", "content": 0.12623852491378784, "timestamp": "2025-09-02 14:17:42.253268", "step": 2504, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:42.306291", "step": 2504, "epoch": 2 }, { "type": "loss", "content": 0.3108784258365631, "timestamp": "2025-09-02 14:17:42.308860", "step": 2505, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:42.362031", "step": 2505, "epoch": 2 }, { "type": "loss", "content": 0.1521572470664978, "timestamp": "2025-09-02 14:17:42.364497", "step": 2506, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:42.418017", "step": 2506, "epoch": 2 }, { "type": "loss", "content": 0.2009454220533371, "timestamp": "2025-09-02 14:17:42.420326", "step": 2507, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:42.474870", "step": 2507, "epoch": 2 }, { "type": "loss", "content": 0.24019776284694672, "timestamp": "2025-09-02 14:17:42.481264", "step": 2508, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:42.534466", "step": 2508, "epoch": 2 }, { "type": "loss", "content": 0.17276625335216522, "timestamp": "2025-09-02 14:17:42.536491", "step": 2509, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:42.590039", "step": 2509, "epoch": 2 }, { "type": "loss", "content": 0.12630638480186462, "timestamp": "2025-09-02 14:17:42.592510", "step": 2510, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:42.645881", "step": 2510, "epoch": 2 }, { "type": "loss", "content": 0.21174366772174835, "timestamp": "2025-09-02 14:17:42.648350", "step": 2511, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:42.702021", "step": 2511, "epoch": 2 }, { "type": "loss", "content": 0.08587898313999176, "timestamp": "2025-09-02 14:17:42.708088", "step": 2512, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:42.761206", "step": 2512, "epoch": 2 }, { "type": "loss", "content": 0.09762149304151535, "timestamp": "2025-09-02 14:17:42.763411", "step": 2513, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:42.816146", "step": 2513, "epoch": 2 }, { "type": "loss", "content": 0.33575060963630676, "timestamp": "2025-09-02 14:17:42.818396", "step": 2514, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:42.872200", "step": 2514, "epoch": 2 }, { "type": "loss", "content": 0.20518064498901367, "timestamp": "2025-09-02 14:17:42.874506", "step": 2515, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:42.928363", "step": 2515, "epoch": 2 }, { "type": "loss", "content": 0.1743999421596527, "timestamp": "2025-09-02 14:17:42.934459", "step": 2516, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:42.987371", "step": 2516, "epoch": 2 }, { "type": "loss", "content": 0.16661131381988525, "timestamp": "2025-09-02 14:17:42.989611", "step": 2517, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:43.043194", "step": 2517, "epoch": 2 }, { "type": "loss", "content": 0.19275438785552979, "timestamp": "2025-09-02 14:17:43.045384", "step": 2518, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:43.099025", "step": 2518, "epoch": 2 }, { "type": "loss", "content": 0.31283605098724365, "timestamp": "2025-09-02 14:17:43.101115", "step": 2519, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:43.154514", "step": 2519, "epoch": 2 }, { "type": "loss", "content": 0.26567205786705017, "timestamp": "2025-09-02 14:17:43.160512", "step": 2520, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:17:45.061844", "step": 2520, "epoch": 2 }, { "type": "pplx", "content": 8341.964577227061, "timestamp": "2025-09-02 14:17:45.064203", "step": 2520, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2520", "timestamp": "2025-09-02 14:17:45.455244", "step": 2520, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:45.511233", "step": 2520, "epoch": 2 }, { "type": "loss", "content": 0.1900518536567688, "timestamp": "2025-09-02 14:17:45.513785", "step": 2521, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:45.569212", "step": 2521, "epoch": 2 }, { "type": "loss", "content": 0.1381150633096695, "timestamp": "2025-09-02 14:17:45.571772", "step": 2522, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:45.625511", "step": 2522, "epoch": 2 }, { "type": "loss", "content": 0.15217900276184082, "timestamp": "2025-09-02 14:17:45.628167", "step": 2523, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:45.681410", "step": 2523, "epoch": 2 }, { "type": "loss", "content": 0.23363547027111053, "timestamp": "2025-09-02 14:17:45.687652", "step": 2524, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:45.740879", "step": 2524, "epoch": 2 }, { "type": "loss", "content": 0.11225120723247528, "timestamp": "2025-09-02 14:17:45.743446", "step": 2525, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:45.796352", "step": 2525, "epoch": 2 }, { "type": "loss", "content": 0.21692264080047607, "timestamp": "2025-09-02 14:17:45.798615", "step": 2526, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:45.851878", "step": 2526, "epoch": 2 }, { "type": "loss", "content": 0.07489795982837677, "timestamp": "2025-09-02 14:17:45.854132", "step": 2527, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:45.907345", "step": 2527, "epoch": 2 }, { "type": "loss", "content": 0.24721089005470276, "timestamp": "2025-09-02 14:17:45.913214", "step": 2528, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:45.966365", "step": 2528, "epoch": 2 }, { "type": "loss", "content": 0.19774878025054932, "timestamp": "2025-09-02 14:17:45.968902", "step": 2529, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:46.022327", "step": 2529, "epoch": 2 }, { "type": "loss", "content": 0.12301738560199738, "timestamp": "2025-09-02 14:17:46.024727", "step": 2530, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:46.078220", "step": 2530, "epoch": 2 }, { "type": "loss", "content": 0.17293599247932434, "timestamp": "2025-09-02 14:17:46.080388", "step": 2531, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:46.134361", "step": 2531, "epoch": 2 }, { "type": "loss", "content": 0.08924397826194763, "timestamp": "2025-09-02 14:17:46.140408", "step": 2532, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:46.193018", "step": 2532, "epoch": 2 }, { "type": "loss", "content": 0.1502026468515396, "timestamp": "2025-09-02 14:17:46.195339", "step": 2533, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:46.248291", "step": 2533, "epoch": 2 }, { "type": "loss", "content": 0.18962320685386658, "timestamp": "2025-09-02 14:17:46.250602", "step": 2534, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:46.303953", "step": 2534, "epoch": 2 }, { "type": "loss", "content": 0.09870468080043793, "timestamp": "2025-09-02 14:17:46.306421", "step": 2535, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:46.359910", "step": 2535, "epoch": 2 }, { "type": "loss", "content": 0.12892135977745056, "timestamp": "2025-09-02 14:17:46.365666", "step": 2536, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:46.418242", "step": 2536, "epoch": 2 }, { "type": "loss", "content": 0.12990272045135498, "timestamp": "2025-09-02 14:17:46.420724", "step": 2537, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:46.475316", "step": 2537, "epoch": 2 }, { "type": "loss", "content": 0.13224893808364868, "timestamp": "2025-09-02 14:17:46.477715", "step": 2538, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:46.530738", "step": 2538, "epoch": 2 }, { "type": "loss", "content": 0.23548319935798645, "timestamp": "2025-09-02 14:17:46.532975", "step": 2539, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:46.587027", "step": 2539, "epoch": 2 }, { "type": "loss", "content": 0.1911395937204361, "timestamp": "2025-09-02 14:17:46.593044", "step": 2540, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:17:48.510309", "step": 2540, "epoch": 2 }, { "type": "pplx", "content": 8132.013916100752, "timestamp": "2025-09-02 14:17:48.512483", "step": 2540, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:48.566041", "step": 2540, "epoch": 2 }, { "type": "loss", "content": 0.21514219045639038, "timestamp": "2025-09-02 14:17:48.568748", "step": 2541, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:48.623222", "step": 2541, "epoch": 2 }, { "type": "loss", "content": 0.22464188933372498, "timestamp": "2025-09-02 14:17:48.626018", "step": 2542, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:48.679929", "step": 2542, "epoch": 2 }, { "type": "loss", "content": 0.07775857299566269, "timestamp": "2025-09-02 14:17:48.682710", "step": 2543, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:48.736355", "step": 2543, "epoch": 2 }, { "type": "loss", "content": 0.18201065063476562, "timestamp": "2025-09-02 14:17:48.743202", "step": 2544, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:48.797241", "step": 2544, "epoch": 2 }, { "type": "loss", "content": 0.22753669321537018, "timestamp": "2025-09-02 14:17:48.799575", "step": 2545, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:48.853323", "step": 2545, "epoch": 2 }, { "type": "loss", "content": 0.18042731285095215, "timestamp": "2025-09-02 14:17:48.855684", "step": 2546, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:48.912557", "step": 2546, "epoch": 2 }, { "type": "loss", "content": 0.21287783980369568, "timestamp": "2025-09-02 14:17:48.914977", "step": 2547, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:48.968368", "step": 2547, "epoch": 2 }, { "type": "loss", "content": 0.14059846103191376, "timestamp": "2025-09-02 14:17:48.974706", "step": 2548, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:49.027611", "step": 2548, "epoch": 2 }, { "type": "loss", "content": 0.13730695843696594, "timestamp": "2025-09-02 14:17:49.030362", "step": 2549, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:49.084218", "step": 2549, "epoch": 2 }, { "type": "loss", "content": 0.19414715468883514, "timestamp": "2025-09-02 14:17:49.086819", "step": 2550, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:49.140530", "step": 2550, "epoch": 2 }, { "type": "loss", "content": 0.17204679548740387, "timestamp": "2025-09-02 14:17:49.142910", "step": 2551, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:49.196675", "step": 2551, "epoch": 2 }, { "type": "loss", "content": 0.12406240403652191, "timestamp": "2025-09-02 14:17:49.202971", "step": 2552, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:49.255894", "step": 2552, "epoch": 2 }, { "type": "loss", "content": 0.20099306106567383, "timestamp": "2025-09-02 14:17:49.258419", "step": 2553, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:49.312495", "step": 2553, "epoch": 2 }, { "type": "loss", "content": 0.3003740608692169, "timestamp": "2025-09-02 14:17:49.315181", "step": 2554, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:49.369639", "step": 2554, "epoch": 2 }, { "type": "loss", "content": 0.20317065715789795, "timestamp": "2025-09-02 14:17:49.371978", "step": 2555, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:49.425587", "step": 2555, "epoch": 2 }, { "type": "loss", "content": 0.32985633611679077, "timestamp": "2025-09-02 14:17:49.431801", "step": 2556, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:49.485405", "step": 2556, "epoch": 2 }, { "type": "loss", "content": 0.2310764491558075, "timestamp": "2025-09-02 14:17:49.487886", "step": 2557, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:49.541644", "step": 2557, "epoch": 2 }, { "type": "loss", "content": 0.15337935090065002, "timestamp": "2025-09-02 14:17:49.543822", "step": 2558, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:49.600051", "step": 2558, "epoch": 2 }, { "type": "loss", "content": 0.21662653982639313, "timestamp": "2025-09-02 14:17:49.602596", "step": 2559, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:49.656603", "step": 2559, "epoch": 2 }, { "type": "loss", "content": 0.19359417259693146, "timestamp": "2025-09-02 14:17:49.662803", "step": 2560, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:17:51.603493", "step": 2560, "epoch": 2 }, { "type": "pplx", "content": 8095.856436284856, "timestamp": "2025-09-02 14:17:51.605707", "step": 2560, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2560", "timestamp": "2025-09-02 14:17:52.168870", "step": 2560, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:52.225210", "step": 2560, "epoch": 2 }, { "type": "loss", "content": 0.2687457203865051, "timestamp": "2025-09-02 14:17:52.227867", "step": 2561, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:52.282332", "step": 2561, "epoch": 2 }, { "type": "loss", "content": 0.19965441524982452, "timestamp": "2025-09-02 14:17:52.285548", "step": 2562, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:52.340291", "step": 2562, "epoch": 2 }, { "type": "loss", "content": 0.09909535944461823, "timestamp": "2025-09-02 14:17:52.342962", "step": 2563, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:52.397145", "step": 2563, "epoch": 2 }, { "type": "loss", "content": 0.11296039074659348, "timestamp": "2025-09-02 14:17:52.403501", "step": 2564, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:52.457185", "step": 2564, "epoch": 2 }, { "type": "loss", "content": 0.23613907396793365, "timestamp": "2025-09-02 14:17:52.459605", "step": 2565, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:52.512761", "step": 2565, "epoch": 2 }, { "type": "loss", "content": 0.123189277946949, "timestamp": "2025-09-02 14:17:52.514977", "step": 2566, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:52.568681", "step": 2566, "epoch": 2 }, { "type": "loss", "content": 0.1039569303393364, "timestamp": "2025-09-02 14:17:52.572527", "step": 2567, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:52.626440", "step": 2567, "epoch": 2 }, { "type": "loss", "content": 0.19060775637626648, "timestamp": "2025-09-02 14:17:52.632824", "step": 2568, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:52.685235", "step": 2568, "epoch": 2 }, { "type": "loss", "content": 0.15392157435417175, "timestamp": "2025-09-02 14:17:52.687834", "step": 2569, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:52.741033", "step": 2569, "epoch": 2 }, { "type": "loss", "content": 0.13027465343475342, "timestamp": "2025-09-02 14:17:52.743530", "step": 2570, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:52.796970", "step": 2570, "epoch": 2 }, { "type": "loss", "content": 0.09555923938751221, "timestamp": "2025-09-02 14:17:52.799498", "step": 2571, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:52.854215", "step": 2571, "epoch": 2 }, { "type": "loss", "content": 0.17686577141284943, "timestamp": "2025-09-02 14:17:52.860397", "step": 2572, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:52.913350", "step": 2572, "epoch": 2 }, { "type": "loss", "content": 0.3070344030857086, "timestamp": "2025-09-02 14:17:52.915710", "step": 2573, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:52.969616", "step": 2573, "epoch": 2 }, { "type": "loss", "content": 0.15076296031475067, "timestamp": "2025-09-02 14:17:52.972062", "step": 2574, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:53.026735", "step": 2574, "epoch": 2 }, { "type": "loss", "content": 0.2124018371105194, "timestamp": "2025-09-02 14:17:53.029104", "step": 2575, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:53.085769", "step": 2575, "epoch": 2 }, { "type": "loss", "content": 0.20549577474594116, "timestamp": "2025-09-02 14:17:53.091853", "step": 2576, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:53.156494", "step": 2576, "epoch": 2 }, { "type": "loss", "content": 0.23058323562145233, "timestamp": "2025-09-02 14:17:53.158745", "step": 2577, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:53.211664", "step": 2577, "epoch": 2 }, { "type": "loss", "content": 0.1175355389714241, "timestamp": "2025-09-02 14:17:53.214110", "step": 2578, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:53.268407", "step": 2578, "epoch": 2 }, { "type": "loss", "content": 0.2525724470615387, "timestamp": "2025-09-02 14:17:53.270599", "step": 2579, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:53.324173", "step": 2579, "epoch": 2 }, { "type": "loss", "content": 0.14307792484760284, "timestamp": "2025-09-02 14:17:53.330558", "step": 2580, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:17:55.224199", "step": 2580, "epoch": 2 }, { "type": "pplx", "content": 8053.67698650011, "timestamp": "2025-09-02 14:17:55.226490", "step": 2580, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:55.280440", "step": 2580, "epoch": 2 }, { "type": "loss", "content": 0.11465591192245483, "timestamp": "2025-09-02 14:17:55.282821", "step": 2581, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:55.337775", "step": 2581, "epoch": 2 }, { "type": "loss", "content": 0.12406628578901291, "timestamp": "2025-09-02 14:17:55.340257", "step": 2582, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:55.393809", "step": 2582, "epoch": 2 }, { "type": "loss", "content": 0.1727665811777115, "timestamp": "2025-09-02 14:17:55.395934", "step": 2583, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:55.448896", "step": 2583, "epoch": 2 }, { "type": "loss", "content": 0.22390961647033691, "timestamp": "2025-09-02 14:17:55.454576", "step": 2584, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:55.507526", "step": 2584, "epoch": 2 }, { "type": "loss", "content": 0.15938052535057068, "timestamp": "2025-09-02 14:17:55.510523", "step": 2585, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:55.564858", "step": 2585, "epoch": 2 }, { "type": "loss", "content": 0.07336515188217163, "timestamp": "2025-09-02 14:17:55.567170", "step": 2586, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:55.621175", "step": 2586, "epoch": 2 }, { "type": "loss", "content": 0.20760884881019592, "timestamp": "2025-09-02 14:17:55.623836", "step": 2587, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:55.677508", "step": 2587, "epoch": 2 }, { "type": "loss", "content": 0.20545902848243713, "timestamp": "2025-09-02 14:17:55.683628", "step": 2588, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:55.736871", "step": 2588, "epoch": 2 }, { "type": "loss", "content": 0.18949733674526215, "timestamp": "2025-09-02 14:17:55.739269", "step": 2589, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:55.794702", "step": 2589, "epoch": 2 }, { "type": "loss", "content": 0.16173100471496582, "timestamp": "2025-09-02 14:17:55.796715", "step": 2590, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:55.849875", "step": 2590, "epoch": 2 }, { "type": "loss", "content": 0.1374870389699936, "timestamp": "2025-09-02 14:17:55.851957", "step": 2591, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:55.905217", "step": 2591, "epoch": 2 }, { "type": "loss", "content": 0.15166430175304413, "timestamp": "2025-09-02 14:17:55.911414", "step": 2592, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:55.965666", "step": 2592, "epoch": 2 }, { "type": "loss", "content": 0.1706613004207611, "timestamp": "2025-09-02 14:17:55.967961", "step": 2593, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:56.021603", "step": 2593, "epoch": 2 }, { "type": "loss", "content": 0.11123055964708328, "timestamp": "2025-09-02 14:17:56.023753", "step": 2594, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:56.077489", "step": 2594, "epoch": 2 }, { "type": "loss", "content": 0.16125105321407318, "timestamp": "2025-09-02 14:17:56.079598", "step": 2595, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:56.132312", "step": 2595, "epoch": 2 }, { "type": "loss", "content": 0.3461070656776428, "timestamp": "2025-09-02 14:17:56.137786", "step": 2596, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:56.190515", "step": 2596, "epoch": 2 }, { "type": "loss", "content": 0.1154390498995781, "timestamp": "2025-09-02 14:17:56.192773", "step": 2597, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:56.246101", "step": 2597, "epoch": 2 }, { "type": "loss", "content": 0.10620669275522232, "timestamp": "2025-09-02 14:17:56.248177", "step": 2598, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:56.302238", "step": 2598, "epoch": 2 }, { "type": "loss", "content": 0.20327620208263397, "timestamp": "2025-09-02 14:17:56.304758", "step": 2599, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:56.358097", "step": 2599, "epoch": 2 }, { "type": "loss", "content": 0.14055785536766052, "timestamp": "2025-09-02 14:17:56.364286", "step": 2600, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:17:58.266772", "step": 2600, "epoch": 2 }, { "type": "pplx", "content": 7909.146122143173, "timestamp": "2025-09-02 14:17:58.269242", "step": 2600, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 2600", "timestamp": "2025-09-02 14:17:58.634704", "step": 2600, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:58.692814", "step": 2600, "epoch": 3 }, { "type": "loss", "content": 0.15428882837295532, "timestamp": "2025-09-02 14:17:58.695057", "step": 2601, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:58.749275", "step": 2601, "epoch": 3 }, { "type": "loss", "content": 0.11817608028650284, "timestamp": "2025-09-02 14:17:58.752214", "step": 2602, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:58.805940", "step": 2602, "epoch": 3 }, { "type": "loss", "content": 0.2116047739982605, "timestamp": "2025-09-02 14:17:58.808061", "step": 2603, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:58.861063", "step": 2603, "epoch": 3 }, { "type": "loss", "content": 0.1811337023973465, "timestamp": "2025-09-02 14:17:58.867695", "step": 2604, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:58.920281", "step": 2604, "epoch": 3 }, { "type": "loss", "content": 0.11433756351470947, "timestamp": "2025-09-02 14:17:58.922601", "step": 2605, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:58.975721", "step": 2605, "epoch": 3 }, { "type": "loss", "content": 0.20377878844738007, "timestamp": "2025-09-02 14:17:58.978124", "step": 2606, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:59.033107", "step": 2606, "epoch": 3 }, { "type": "loss", "content": 0.18749940395355225, "timestamp": "2025-09-02 14:17:59.035480", "step": 2607, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:59.088962", "step": 2607, "epoch": 3 }, { "type": "loss", "content": 0.20425131916999817, "timestamp": "2025-09-02 14:17:59.094908", "step": 2608, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:59.147190", "step": 2608, "epoch": 3 }, { "type": "loss", "content": 0.1523555964231491, "timestamp": "2025-09-02 14:17:59.149396", "step": 2609, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:59.202775", "step": 2609, "epoch": 3 }, { "type": "loss", "content": 0.10931018739938736, "timestamp": "2025-09-02 14:17:59.204973", "step": 2610, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:59.260522", "step": 2610, "epoch": 3 }, { "type": "loss", "content": 0.11336611956357956, "timestamp": "2025-09-02 14:17:59.262919", "step": 2611, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:59.316658", "step": 2611, "epoch": 3 }, { "type": "loss", "content": 0.10513429343700409, "timestamp": "2025-09-02 14:17:59.322865", "step": 2612, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:59.375711", "step": 2612, "epoch": 3 }, { "type": "loss", "content": 0.12612619996070862, "timestamp": "2025-09-02 14:17:59.378018", "step": 2613, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:59.431591", "step": 2613, "epoch": 3 }, { "type": "loss", "content": 0.19460365176200867, "timestamp": "2025-09-02 14:17:59.433925", "step": 2614, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:59.487074", "step": 2614, "epoch": 3 }, { "type": "loss", "content": 0.2768401503562927, "timestamp": "2025-09-02 14:17:59.489427", "step": 2615, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:17:59.543299", "step": 2615, "epoch": 3 }, { "type": "loss", "content": 0.19325435161590576, "timestamp": "2025-09-02 14:17:59.549546", "step": 2616, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:59.605583", "step": 2616, "epoch": 3 }, { "type": "loss", "content": 0.08769180625677109, "timestamp": "2025-09-02 14:17:59.608159", "step": 2617, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:17:59.662235", "step": 2617, "epoch": 3 }, { "type": "loss", "content": 0.15287744998931885, "timestamp": "2025-09-02 14:17:59.664705", "step": 2618, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:59.718254", "step": 2618, "epoch": 3 }, { "type": "loss", "content": 0.220246359705925, "timestamp": "2025-09-02 14:17:59.720606", "step": 2619, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:17:59.773750", "step": 2619, "epoch": 3 }, { "type": "loss", "content": 0.15402072668075562, "timestamp": "2025-09-02 14:17:59.779840", "step": 2620, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:18:01.691936", "step": 2620, "epoch": 3 }, { "type": "pplx", "content": 7987.378450148866, "timestamp": "2025-09-02 14:18:01.694431", "step": 2620, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:01.746638", "step": 2620, "epoch": 3 }, { "type": "loss", "content": 0.1549813151359558, "timestamp": "2025-09-02 14:18:01.748813", "step": 2621, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:01.803266", "step": 2621, "epoch": 3 }, { "type": "loss", "content": 0.11535754799842834, "timestamp": "2025-09-02 14:18:01.805207", "step": 2622, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:01.858944", "step": 2622, "epoch": 3 }, { "type": "loss", "content": 0.18978162109851837, "timestamp": "2025-09-02 14:18:01.860755", "step": 2623, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:01.914820", "step": 2623, "epoch": 3 }, { "type": "loss", "content": 0.19519788026809692, "timestamp": "2025-09-02 14:18:01.920875", "step": 2624, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:01.984213", "step": 2624, "epoch": 3 }, { "type": "loss", "content": 0.19358353316783905, "timestamp": "2025-09-02 14:18:01.986228", "step": 2625, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:02.040218", "step": 2625, "epoch": 3 }, { "type": "loss", "content": 0.179507315158844, "timestamp": "2025-09-02 14:18:02.042537", "step": 2626, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:02.098626", "step": 2626, "epoch": 3 }, { "type": "loss", "content": 0.2232421636581421, "timestamp": "2025-09-02 14:18:02.100925", "step": 2627, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:02.157387", "step": 2627, "epoch": 3 }, { "type": "loss", "content": 0.20479780435562134, "timestamp": "2025-09-02 14:18:02.162714", "step": 2628, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:02.217281", "step": 2628, "epoch": 3 }, { "type": "loss", "content": 0.14906606078147888, "timestamp": "2025-09-02 14:18:02.219500", "step": 2629, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:02.273933", "step": 2629, "epoch": 3 }, { "type": "loss", "content": 0.21225106716156006, "timestamp": "2025-09-02 14:18:02.275872", "step": 2630, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:02.328657", "step": 2630, "epoch": 3 }, { "type": "loss", "content": 0.1388361006975174, "timestamp": "2025-09-02 14:18:02.330816", "step": 2631, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:02.383537", "step": 2631, "epoch": 3 }, { "type": "loss", "content": 0.13424326479434967, "timestamp": "2025-09-02 14:18:02.389390", "step": 2632, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:02.442212", "step": 2632, "epoch": 3 }, { "type": "loss", "content": 0.13152605295181274, "timestamp": "2025-09-02 14:18:02.444603", "step": 2633, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:02.498247", "step": 2633, "epoch": 3 }, { "type": "loss", "content": 0.245570108294487, "timestamp": "2025-09-02 14:18:02.501388", "step": 2634, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:02.555936", "step": 2634, "epoch": 3 }, { "type": "loss", "content": 0.2710901200771332, "timestamp": "2025-09-02 14:18:02.558293", "step": 2635, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:02.611640", "step": 2635, "epoch": 3 }, { "type": "loss", "content": 0.20152045786380768, "timestamp": "2025-09-02 14:18:02.617835", "step": 2636, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:02.670650", "step": 2636, "epoch": 3 }, { "type": "loss", "content": 0.1028401330113411, "timestamp": "2025-09-02 14:18:02.672456", "step": 2637, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:02.725779", "step": 2637, "epoch": 3 }, { "type": "loss", "content": 0.22835476696491241, "timestamp": "2025-09-02 14:18:02.728230", "step": 2638, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:02.781761", "step": 2638, "epoch": 3 }, { "type": "loss", "content": 0.21513372659683228, "timestamp": "2025-09-02 14:18:02.784043", "step": 2639, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:02.837417", "step": 2639, "epoch": 3 }, { "type": "loss", "content": 0.06071390584111214, "timestamp": "2025-09-02 14:18:02.843574", "step": 2640, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:18:04.734551", "step": 2640, "epoch": 3 }, { "type": "pplx", "content": 7894.602972901846, "timestamp": "2025-09-02 14:18:04.736903", "step": 2640, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2640", "timestamp": "2025-09-02 14:18:05.103710", "step": 2640, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:05.160206", "step": 2640, "epoch": 3 }, { "type": "loss", "content": 0.16681084036827087, "timestamp": "2025-09-02 14:18:05.162237", "step": 2641, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:05.216514", "step": 2641, "epoch": 3 }, { "type": "loss", "content": 0.1080818697810173, "timestamp": "2025-09-02 14:18:05.218765", "step": 2642, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:05.272038", "step": 2642, "epoch": 3 }, { "type": "loss", "content": 0.2221183031797409, "timestamp": "2025-09-02 14:18:05.274239", "step": 2643, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:05.327807", "step": 2643, "epoch": 3 }, { "type": "loss", "content": 0.07688391208648682, "timestamp": "2025-09-02 14:18:05.333743", "step": 2644, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:05.386155", "step": 2644, "epoch": 3 }, { "type": "loss", "content": 0.15494056046009064, "timestamp": "2025-09-02 14:18:05.388856", "step": 2645, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:05.442070", "step": 2645, "epoch": 3 }, { "type": "loss", "content": 0.24960359930992126, "timestamp": "2025-09-02 14:18:05.444598", "step": 2646, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:05.497791", "step": 2646, "epoch": 3 }, { "type": "loss", "content": 0.17253834009170532, "timestamp": "2025-09-02 14:18:05.500094", "step": 2647, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:05.553398", "step": 2647, "epoch": 3 }, { "type": "loss", "content": 0.06798109412193298, "timestamp": "2025-09-02 14:18:05.559446", "step": 2648, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:05.612544", "step": 2648, "epoch": 3 }, { "type": "loss", "content": 0.09403090178966522, "timestamp": "2025-09-02 14:18:05.615070", "step": 2649, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:05.668942", "step": 2649, "epoch": 3 }, { "type": "loss", "content": 0.34361687302589417, "timestamp": "2025-09-02 14:18:05.671394", "step": 2650, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:05.724680", "step": 2650, "epoch": 3 }, { "type": "loss", "content": 0.2109575867652893, "timestamp": "2025-09-02 14:18:05.726962", "step": 2651, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:05.780418", "step": 2651, "epoch": 3 }, { "type": "loss", "content": 0.1616998165845871, "timestamp": "2025-09-02 14:18:05.786099", "step": 2652, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:05.838870", "step": 2652, "epoch": 3 }, { "type": "loss", "content": 0.29546865820884705, "timestamp": "2025-09-02 14:18:05.841944", "step": 2653, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:05.894918", "step": 2653, "epoch": 3 }, { "type": "loss", "content": 0.19168305397033691, "timestamp": "2025-09-02 14:18:05.897193", "step": 2654, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:05.950766", "step": 2654, "epoch": 3 }, { "type": "loss", "content": 0.1701042652130127, "timestamp": "2025-09-02 14:18:05.953172", "step": 2655, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:06.006378", "step": 2655, "epoch": 3 }, { "type": "loss", "content": 0.12833528220653534, "timestamp": "2025-09-02 14:18:06.012538", "step": 2656, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:06.065382", "step": 2656, "epoch": 3 }, { "type": "loss", "content": 0.3127908408641815, "timestamp": "2025-09-02 14:18:06.067533", "step": 2657, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:06.120514", "step": 2657, "epoch": 3 }, { "type": "loss", "content": 0.11508885025978088, "timestamp": "2025-09-02 14:18:06.122509", "step": 2658, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:06.175624", "step": 2658, "epoch": 3 }, { "type": "loss", "content": 0.24563640356063843, "timestamp": "2025-09-02 14:18:06.178035", "step": 2659, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:06.230619", "step": 2659, "epoch": 3 }, { "type": "loss", "content": 0.1982627660036087, "timestamp": "2025-09-02 14:18:06.236526", "step": 2660, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:18:08.148812", "step": 2660, "epoch": 3 }, { "type": "pplx", "content": 7579.786071463196, "timestamp": "2025-09-02 14:18:08.151152", "step": 2660, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:08.204341", "step": 2660, "epoch": 3 }, { "type": "loss", "content": 0.028847189620137215, "timestamp": "2025-09-02 14:18:08.206882", "step": 2661, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:08.261368", "step": 2661, "epoch": 3 }, { "type": "loss", "content": 0.2725866436958313, "timestamp": "2025-09-02 14:18:08.263598", "step": 2662, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:08.317479", "step": 2662, "epoch": 3 }, { "type": "loss", "content": 0.1018921434879303, "timestamp": "2025-09-02 14:18:08.319705", "step": 2663, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:08.373008", "step": 2663, "epoch": 3 }, { "type": "loss", "content": 0.12374533712863922, "timestamp": "2025-09-02 14:18:08.379238", "step": 2664, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:08.431427", "step": 2664, "epoch": 3 }, { "type": "loss", "content": 0.14554862678050995, "timestamp": "2025-09-02 14:18:08.433820", "step": 2665, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:08.487138", "step": 2665, "epoch": 3 }, { "type": "loss", "content": 0.28143951296806335, "timestamp": "2025-09-02 14:18:08.489440", "step": 2666, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:08.542492", "step": 2666, "epoch": 3 }, { "type": "loss", "content": 0.17953741550445557, "timestamp": "2025-09-02 14:18:08.544942", "step": 2667, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:08.599196", "step": 2667, "epoch": 3 }, { "type": "loss", "content": 0.09111257642507553, "timestamp": "2025-09-02 14:18:08.605308", "step": 2668, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:08.660078", "step": 2668, "epoch": 3 }, { "type": "loss", "content": 0.18949094414710999, "timestamp": "2025-09-02 14:18:08.662403", "step": 2669, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:08.715116", "step": 2669, "epoch": 3 }, { "type": "loss", "content": 0.11787185072898865, "timestamp": "2025-09-02 14:18:08.717738", "step": 2670, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:08.771443", "step": 2670, "epoch": 3 }, { "type": "loss", "content": 0.23324944078922272, "timestamp": "2025-09-02 14:18:08.773738", "step": 2671, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:08.826810", "step": 2671, "epoch": 3 }, { "type": "loss", "content": 0.15041343867778778, "timestamp": "2025-09-02 14:18:08.832746", "step": 2672, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:08.884800", "step": 2672, "epoch": 3 }, { "type": "loss", "content": 0.18693388998508453, "timestamp": "2025-09-02 14:18:08.887100", "step": 2673, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:08.940198", "step": 2673, "epoch": 3 }, { "type": "loss", "content": 0.089780792593956, "timestamp": "2025-09-02 14:18:08.942314", "step": 2674, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:08.995095", "step": 2674, "epoch": 3 }, { "type": "loss", "content": 0.11929464340209961, "timestamp": "2025-09-02 14:18:08.997670", "step": 2675, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:09.050961", "step": 2675, "epoch": 3 }, { "type": "loss", "content": 0.0980224683880806, "timestamp": "2025-09-02 14:18:09.056740", "step": 2676, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:09.109929", "step": 2676, "epoch": 3 }, { "type": "loss", "content": 0.16480903327465057, "timestamp": "2025-09-02 14:18:09.112073", "step": 2677, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:09.167200", "step": 2677, "epoch": 3 }, { "type": "loss", "content": 0.26727738976478577, "timestamp": "2025-09-02 14:18:09.169024", "step": 2678, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:09.221945", "step": 2678, "epoch": 3 }, { "type": "loss", "content": 0.33281630277633667, "timestamp": "2025-09-02 14:18:09.224049", "step": 2679, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:09.277526", "step": 2679, "epoch": 3 }, { "type": "loss", "content": 0.1978529691696167, "timestamp": "2025-09-02 14:18:09.283430", "step": 2680, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:18:11.189464", "step": 2680, "epoch": 3 }, { "type": "pplx", "content": 7231.4803751943355, "timestamp": "2025-09-02 14:18:11.191832", "step": 2680, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2680", "timestamp": "2025-09-02 14:18:11.553690", "step": 2680, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:11.610430", "step": 2680, "epoch": 3 }, { "type": "loss", "content": 0.1763889193534851, "timestamp": "2025-09-02 14:18:11.614000", "step": 2681, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:11.668820", "step": 2681, "epoch": 3 }, { "type": "loss", "content": 0.1570979654788971, "timestamp": "2025-09-02 14:18:11.670937", "step": 2682, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:11.725492", "step": 2682, "epoch": 3 }, { "type": "loss", "content": 0.08611602336168289, "timestamp": "2025-09-02 14:18:11.727738", "step": 2683, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:11.780896", "step": 2683, "epoch": 3 }, { "type": "loss", "content": 0.2719368636608124, "timestamp": "2025-09-02 14:18:11.787124", "step": 2684, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:11.839307", "step": 2684, "epoch": 3 }, { "type": "loss", "content": 0.13837024569511414, "timestamp": "2025-09-02 14:18:11.841421", "step": 2685, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:11.894339", "step": 2685, "epoch": 3 }, { "type": "loss", "content": 0.19439451396465302, "timestamp": "2025-09-02 14:18:11.896530", "step": 2686, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:11.949687", "step": 2686, "epoch": 3 }, { "type": "loss", "content": 0.17734065651893616, "timestamp": "2025-09-02 14:18:11.952432", "step": 2687, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:12.007181", "step": 2687, "epoch": 3 }, { "type": "loss", "content": 0.21178095042705536, "timestamp": "2025-09-02 14:18:12.013529", "step": 2688, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:12.068140", "step": 2688, "epoch": 3 }, { "type": "loss", "content": 0.15134747326374054, "timestamp": "2025-09-02 14:18:12.070387", "step": 2689, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:12.123406", "step": 2689, "epoch": 3 }, { "type": "loss", "content": 0.15569238364696503, "timestamp": "2025-09-02 14:18:12.125600", "step": 2690, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:12.179523", "step": 2690, "epoch": 3 }, { "type": "loss", "content": 0.14443965256214142, "timestamp": "2025-09-02 14:18:12.182122", "step": 2691, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:12.235690", "step": 2691, "epoch": 3 }, { "type": "loss", "content": 0.14807632565498352, "timestamp": "2025-09-02 14:18:12.241439", "step": 2692, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:12.294675", "step": 2692, "epoch": 3 }, { "type": "loss", "content": 0.07497920095920563, "timestamp": "2025-09-02 14:18:12.297150", "step": 2693, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:12.350085", "step": 2693, "epoch": 3 }, { "type": "loss", "content": 0.17546452581882477, "timestamp": "2025-09-02 14:18:12.352270", "step": 2694, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:12.408015", "step": 2694, "epoch": 3 }, { "type": "loss", "content": 0.1469348520040512, "timestamp": "2025-09-02 14:18:12.411011", "step": 2695, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:12.466025", "step": 2695, "epoch": 3 }, { "type": "loss", "content": 0.13884606957435608, "timestamp": "2025-09-02 14:18:12.471947", "step": 2696, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:12.524848", "step": 2696, "epoch": 3 }, { "type": "loss", "content": 0.19038167595863342, "timestamp": "2025-09-02 14:18:12.527346", "step": 2697, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:12.580618", "step": 2697, "epoch": 3 }, { "type": "loss", "content": 0.12595823407173157, "timestamp": "2025-09-02 14:18:12.582633", "step": 2698, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:12.635240", "step": 2698, "epoch": 3 }, { "type": "loss", "content": 0.189218208193779, "timestamp": "2025-09-02 14:18:12.637439", "step": 2699, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:12.690237", "step": 2699, "epoch": 3 }, { "type": "loss", "content": 0.14650043845176697, "timestamp": "2025-09-02 14:18:12.696271", "step": 2700, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:18:14.637793", "step": 2700, "epoch": 3 }, { "type": "pplx", "content": 7181.017688406329, "timestamp": "2025-09-02 14:18:14.639825", "step": 2700, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:14.691921", "step": 2700, "epoch": 3 }, { "type": "loss", "content": 0.16070817410945892, "timestamp": "2025-09-02 14:18:14.693891", "step": 2701, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:14.748042", "step": 2701, "epoch": 3 }, { "type": "loss", "content": 0.11099323630332947, "timestamp": "2025-09-02 14:18:14.750731", "step": 2702, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:14.805233", "step": 2702, "epoch": 3 }, { "type": "loss", "content": 0.10240935534238815, "timestamp": "2025-09-02 14:18:14.807724", "step": 2703, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:14.860869", "step": 2703, "epoch": 3 }, { "type": "loss", "content": 0.07491608709096909, "timestamp": "2025-09-02 14:18:14.866942", "step": 2704, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:14.919642", "step": 2704, "epoch": 3 }, { "type": "loss", "content": 0.3033526837825775, "timestamp": "2025-09-02 14:18:14.921949", "step": 2705, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:14.974775", "step": 2705, "epoch": 3 }, { "type": "loss", "content": 0.12288542836904526, "timestamp": "2025-09-02 14:18:14.977068", "step": 2706, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:15.030860", "step": 2706, "epoch": 3 }, { "type": "loss", "content": 0.27340826392173767, "timestamp": "2025-09-02 14:18:15.033332", "step": 2707, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:15.086605", "step": 2707, "epoch": 3 }, { "type": "loss", "content": 0.08569242060184479, "timestamp": "2025-09-02 14:18:15.092554", "step": 2708, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:15.145685", "step": 2708, "epoch": 3 }, { "type": "loss", "content": 0.11479819566011429, "timestamp": "2025-09-02 14:18:15.148134", "step": 2709, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:15.203446", "step": 2709, "epoch": 3 }, { "type": "loss", "content": 0.17112331092357635, "timestamp": "2025-09-02 14:18:15.205881", "step": 2710, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:15.260784", "step": 2710, "epoch": 3 }, { "type": "loss", "content": 0.23592695593833923, "timestamp": "2025-09-02 14:18:15.263221", "step": 2711, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:15.316738", "step": 2711, "epoch": 3 }, { "type": "loss", "content": 0.1617148518562317, "timestamp": "2025-09-02 14:18:15.323122", "step": 2712, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:15.376527", "step": 2712, "epoch": 3 }, { "type": "loss", "content": 0.2258547693490982, "timestamp": "2025-09-02 14:18:15.379238", "step": 2713, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:15.433273", "step": 2713, "epoch": 3 }, { "type": "loss", "content": 0.20768502354621887, "timestamp": "2025-09-02 14:18:15.435945", "step": 2714, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:15.489037", "step": 2714, "epoch": 3 }, { "type": "loss", "content": 0.188169926404953, "timestamp": "2025-09-02 14:18:15.491296", "step": 2715, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:15.545646", "step": 2715, "epoch": 3 }, { "type": "loss", "content": 0.1810438334941864, "timestamp": "2025-09-02 14:18:15.552060", "step": 2716, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:15.605984", "step": 2716, "epoch": 3 }, { "type": "loss", "content": 0.2244543433189392, "timestamp": "2025-09-02 14:18:15.608024", "step": 2717, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:15.662904", "step": 2717, "epoch": 3 }, { "type": "loss", "content": 0.09628872573375702, "timestamp": "2025-09-02 14:18:15.665440", "step": 2718, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:15.718578", "step": 2718, "epoch": 3 }, { "type": "loss", "content": 0.23768992722034454, "timestamp": "2025-09-02 14:18:15.720615", "step": 2719, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:15.774640", "step": 2719, "epoch": 3 }, { "type": "loss", "content": 0.1408531218767166, "timestamp": "2025-09-02 14:18:15.780421", "step": 2720, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:18:17.679381", "step": 2720, "epoch": 3 }, { "type": "pplx", "content": 7268.376054894939, "timestamp": "2025-09-02 14:18:17.681277", "step": 2720, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2720", "timestamp": "2025-09-02 14:18:18.043332", "step": 2720, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:18.099021", "step": 2720, "epoch": 3 }, { "type": "loss", "content": 0.12744645774364471, "timestamp": "2025-09-02 14:18:18.101125", "step": 2721, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:18.155930", "step": 2721, "epoch": 3 }, { "type": "loss", "content": 0.25581157207489014, "timestamp": "2025-09-02 14:18:18.157974", "step": 2722, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:18.211344", "step": 2722, "epoch": 3 }, { "type": "loss", "content": 0.0880877897143364, "timestamp": "2025-09-02 14:18:18.213669", "step": 2723, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:18.267217", "step": 2723, "epoch": 3 }, { "type": "loss", "content": 0.2699764370918274, "timestamp": "2025-09-02 14:18:18.273156", "step": 2724, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:18.326809", "step": 2724, "epoch": 3 }, { "type": "loss", "content": 0.2020975947380066, "timestamp": "2025-09-02 14:18:18.328929", "step": 2725, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:18.383364", "step": 2725, "epoch": 3 }, { "type": "loss", "content": 0.13267815113067627, "timestamp": "2025-09-02 14:18:18.385274", "step": 2726, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:18.438468", "step": 2726, "epoch": 3 }, { "type": "loss", "content": 0.20206794142723083, "timestamp": "2025-09-02 14:18:18.441047", "step": 2727, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:18.495037", "step": 2727, "epoch": 3 }, { "type": "loss", "content": 0.09598584473133087, "timestamp": "2025-09-02 14:18:18.500751", "step": 2728, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:18.552744", "step": 2728, "epoch": 3 }, { "type": "loss", "content": 0.24922871589660645, "timestamp": "2025-09-02 14:18:18.555149", "step": 2729, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:18.608218", "step": 2729, "epoch": 3 }, { "type": "loss", "content": 0.19340427219867706, "timestamp": "2025-09-02 14:18:18.610339", "step": 2730, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:18.663209", "step": 2730, "epoch": 3 }, { "type": "loss", "content": 0.12210763990879059, "timestamp": "2025-09-02 14:18:18.665200", "step": 2731, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:18.717656", "step": 2731, "epoch": 3 }, { "type": "loss", "content": 0.11793673783540726, "timestamp": "2025-09-02 14:18:18.723563", "step": 2732, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:18.776020", "step": 2732, "epoch": 3 }, { "type": "loss", "content": 0.2082826942205429, "timestamp": "2025-09-02 14:18:18.778069", "step": 2733, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:18.830935", "step": 2733, "epoch": 3 }, { "type": "loss", "content": 0.16641807556152344, "timestamp": "2025-09-02 14:18:18.833180", "step": 2734, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:18.888088", "step": 2734, "epoch": 3 }, { "type": "loss", "content": 0.16592438519001007, "timestamp": "2025-09-02 14:18:18.890132", "step": 2735, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:18.942857", "step": 2735, "epoch": 3 }, { "type": "loss", "content": 0.1758667528629303, "timestamp": "2025-09-02 14:18:18.948549", "step": 2736, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:19.000630", "step": 2736, "epoch": 3 }, { "type": "loss", "content": 0.21254800260066986, "timestamp": "2025-09-02 14:18:19.002651", "step": 2737, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:19.055273", "step": 2737, "epoch": 3 }, { "type": "loss", "content": 0.10479537397623062, "timestamp": "2025-09-02 14:18:19.057577", "step": 2738, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:19.110598", "step": 2738, "epoch": 3 }, { "type": "loss", "content": 0.22482332587242126, "timestamp": "2025-09-02 14:18:19.112777", "step": 2739, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:19.165547", "step": 2739, "epoch": 3 }, { "type": "loss", "content": 0.11885706335306168, "timestamp": "2025-09-02 14:18:19.171253", "step": 2740, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:18:21.052195", "step": 2740, "epoch": 3 }, { "type": "pplx", "content": 7426.487915460732, "timestamp": "2025-09-02 14:18:21.054804", "step": 2740, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:21.106920", "step": 2740, "epoch": 3 }, { "type": "loss", "content": 0.18674743175506592, "timestamp": "2025-09-02 14:18:21.108985", "step": 2741, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:21.162392", "step": 2741, "epoch": 3 }, { "type": "loss", "content": 0.13637082278728485, "timestamp": "2025-09-02 14:18:21.164582", "step": 2742, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:21.218470", "step": 2742, "epoch": 3 }, { "type": "loss", "content": 0.15861177444458008, "timestamp": "2025-09-02 14:18:21.220843", "step": 2743, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:21.274082", "step": 2743, "epoch": 3 }, { "type": "loss", "content": 0.16835497319698334, "timestamp": "2025-09-02 14:18:21.279800", "step": 2744, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:21.332914", "step": 2744, "epoch": 3 }, { "type": "loss", "content": 0.1765958070755005, "timestamp": "2025-09-02 14:18:21.336347", "step": 2745, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:21.389385", "step": 2745, "epoch": 3 }, { "type": "loss", "content": 0.1585616171360016, "timestamp": "2025-09-02 14:18:21.391821", "step": 2746, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:21.444947", "step": 2746, "epoch": 3 }, { "type": "loss", "content": 0.1739344298839569, "timestamp": "2025-09-02 14:18:21.447121", "step": 2747, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:21.500114", "step": 2747, "epoch": 3 }, { "type": "loss", "content": 0.28074008226394653, "timestamp": "2025-09-02 14:18:21.506525", "step": 2748, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:21.559645", "step": 2748, "epoch": 3 }, { "type": "loss", "content": 0.16790449619293213, "timestamp": "2025-09-02 14:18:21.561467", "step": 2749, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:21.614509", "step": 2749, "epoch": 3 }, { "type": "loss", "content": 0.038649603724479675, "timestamp": "2025-09-02 14:18:21.616517", "step": 2750, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:21.669527", "step": 2750, "epoch": 3 }, { "type": "loss", "content": 0.10331413894891739, "timestamp": "2025-09-02 14:18:21.671850", "step": 2751, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:21.725091", "step": 2751, "epoch": 3 }, { "type": "loss", "content": 0.1439976841211319, "timestamp": "2025-09-02 14:18:21.730767", "step": 2752, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:21.784141", "step": 2752, "epoch": 3 }, { "type": "loss", "content": 0.28463608026504517, "timestamp": "2025-09-02 14:18:21.786462", "step": 2753, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:21.840092", "step": 2753, "epoch": 3 }, { "type": "loss", "content": 0.2504451274871826, "timestamp": "2025-09-02 14:18:21.843859", "step": 2754, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:21.896997", "step": 2754, "epoch": 3 }, { "type": "loss", "content": 0.17817550897598267, "timestamp": "2025-09-02 14:18:21.899260", "step": 2755, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:21.952336", "step": 2755, "epoch": 3 }, { "type": "loss", "content": 0.10410673171281815, "timestamp": "2025-09-02 14:18:21.958048", "step": 2756, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:22.010233", "step": 2756, "epoch": 3 }, { "type": "loss", "content": 0.2336079627275467, "timestamp": "2025-09-02 14:18:22.012654", "step": 2757, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:22.065206", "step": 2757, "epoch": 3 }, { "type": "loss", "content": 0.1443932056427002, "timestamp": "2025-09-02 14:18:22.067162", "step": 2758, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:22.119872", "step": 2758, "epoch": 3 }, { "type": "loss", "content": 0.10333919525146484, "timestamp": "2025-09-02 14:18:22.122035", "step": 2759, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:22.175625", "step": 2759, "epoch": 3 }, { "type": "loss", "content": 0.13223592936992645, "timestamp": "2025-09-02 14:18:22.181830", "step": 2760, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:18:24.057141", "step": 2760, "epoch": 3 }, { "type": "pplx", "content": 7676.695196335112, "timestamp": "2025-09-02 14:18:24.059518", "step": 2760, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2760", "timestamp": "2025-09-02 14:18:24.442056", "step": 2760, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:24.496370", "step": 2760, "epoch": 3 }, { "type": "loss", "content": 0.1491733193397522, "timestamp": "2025-09-02 14:18:24.498675", "step": 2761, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:24.552574", "step": 2761, "epoch": 3 }, { "type": "loss", "content": 0.09085489809513092, "timestamp": "2025-09-02 14:18:24.555234", "step": 2762, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:24.608932", "step": 2762, "epoch": 3 }, { "type": "loss", "content": 0.18004591763019562, "timestamp": "2025-09-02 14:18:24.611395", "step": 2763, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:24.667618", "step": 2763, "epoch": 3 }, { "type": "loss", "content": 0.1797284632921219, "timestamp": "2025-09-02 14:18:24.673881", "step": 2764, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:24.726024", "step": 2764, "epoch": 3 }, { "type": "loss", "content": 0.1043209657073021, "timestamp": "2025-09-02 14:18:24.728370", "step": 2765, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:24.781308", "step": 2765, "epoch": 3 }, { "type": "loss", "content": 0.08784884959459305, "timestamp": "2025-09-02 14:18:24.783491", "step": 2766, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:24.836560", "step": 2766, "epoch": 3 }, { "type": "loss", "content": 0.11732843518257141, "timestamp": "2025-09-02 14:18:24.838602", "step": 2767, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:24.891368", "step": 2767, "epoch": 3 }, { "type": "loss", "content": 0.16558541357517242, "timestamp": "2025-09-02 14:18:24.897262", "step": 2768, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:24.950535", "step": 2768, "epoch": 3 }, { "type": "loss", "content": 0.1388685554265976, "timestamp": "2025-09-02 14:18:24.952822", "step": 2769, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:25.005490", "step": 2769, "epoch": 3 }, { "type": "loss", "content": 0.16912774741649628, "timestamp": "2025-09-02 14:18:25.007906", "step": 2770, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:25.060340", "step": 2770, "epoch": 3 }, { "type": "loss", "content": 0.09055664390325546, "timestamp": "2025-09-02 14:18:25.062626", "step": 2771, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:25.115299", "step": 2771, "epoch": 3 }, { "type": "loss", "content": 0.09922509640455246, "timestamp": "2025-09-02 14:18:25.121421", "step": 2772, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:25.174022", "step": 2772, "epoch": 3 }, { "type": "loss", "content": 0.09220334142446518, "timestamp": "2025-09-02 14:18:25.176258", "step": 2773, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:25.229625", "step": 2773, "epoch": 3 }, { "type": "loss", "content": 0.15516644716262817, "timestamp": "2025-09-02 14:18:25.231543", "step": 2774, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:25.284115", "step": 2774, "epoch": 3 }, { "type": "loss", "content": 0.17089885473251343, "timestamp": "2025-09-02 14:18:25.287260", "step": 2775, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:25.340289", "step": 2775, "epoch": 3 }, { "type": "loss", "content": 0.18988606333732605, "timestamp": "2025-09-02 14:18:25.345943", "step": 2776, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:25.398584", "step": 2776, "epoch": 3 }, { "type": "loss", "content": 0.13546767830848694, "timestamp": "2025-09-02 14:18:25.400739", "step": 2777, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:25.454559", "step": 2777, "epoch": 3 }, { "type": "loss", "content": 0.10835187137126923, "timestamp": "2025-09-02 14:18:25.456881", "step": 2778, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:25.511642", "step": 2778, "epoch": 3 }, { "type": "loss", "content": 0.11412996053695679, "timestamp": "2025-09-02 14:18:25.513958", "step": 2779, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:25.568368", "step": 2779, "epoch": 3 }, { "type": "loss", "content": 0.25231581926345825, "timestamp": "2025-09-02 14:18:25.574292", "step": 2780, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:18:27.504897", "step": 2780, "epoch": 3 }, { "type": "pplx", "content": 7712.907468365606, "timestamp": "2025-09-02 14:18:27.507308", "step": 2780, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:27.560308", "step": 2780, "epoch": 3 }, { "type": "loss", "content": 0.11136386543512344, "timestamp": "2025-09-02 14:18:27.562489", "step": 2781, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:27.616047", "step": 2781, "epoch": 3 }, { "type": "loss", "content": 0.17642775177955627, "timestamp": "2025-09-02 14:18:27.618465", "step": 2782, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:27.672556", "step": 2782, "epoch": 3 }, { "type": "loss", "content": 0.11174861341714859, "timestamp": "2025-09-02 14:18:27.674859", "step": 2783, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:27.727847", "step": 2783, "epoch": 3 }, { "type": "loss", "content": 0.15772269666194916, "timestamp": "2025-09-02 14:18:27.734276", "step": 2784, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:27.788389", "step": 2784, "epoch": 3 }, { "type": "loss", "content": 0.36064413189888, "timestamp": "2025-09-02 14:18:27.790624", "step": 2785, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:27.843990", "step": 2785, "epoch": 3 }, { "type": "loss", "content": 0.13749048113822937, "timestamp": "2025-09-02 14:18:27.846190", "step": 2786, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:27.899603", "step": 2786, "epoch": 3 }, { "type": "loss", "content": 0.14730709791183472, "timestamp": "2025-09-02 14:18:27.901983", "step": 2787, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:27.955517", "step": 2787, "epoch": 3 }, { "type": "loss", "content": 0.20212635397911072, "timestamp": "2025-09-02 14:18:27.961533", "step": 2788, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:28.014034", "step": 2788, "epoch": 3 }, { "type": "loss", "content": 0.14411918818950653, "timestamp": "2025-09-02 14:18:28.016051", "step": 2789, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:28.069029", "step": 2789, "epoch": 3 }, { "type": "loss", "content": 0.10930624604225159, "timestamp": "2025-09-02 14:18:28.071401", "step": 2790, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:28.124675", "step": 2790, "epoch": 3 }, { "type": "loss", "content": 0.18040627241134644, "timestamp": "2025-09-02 14:18:28.126874", "step": 2791, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:28.180035", "step": 2791, "epoch": 3 }, { "type": "loss", "content": 0.05545199289917946, "timestamp": "2025-09-02 14:18:28.186144", "step": 2792, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:28.239302", "step": 2792, "epoch": 3 }, { "type": "loss", "content": 0.189777210354805, "timestamp": "2025-09-02 14:18:28.241696", "step": 2793, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:28.295056", "step": 2793, "epoch": 3 }, { "type": "loss", "content": 0.19036906957626343, "timestamp": "2025-09-02 14:18:28.297078", "step": 2794, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:28.350177", "step": 2794, "epoch": 3 }, { "type": "loss", "content": 0.20050141215324402, "timestamp": "2025-09-02 14:18:28.352402", "step": 2795, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:28.406031", "step": 2795, "epoch": 3 }, { "type": "loss", "content": 0.15215148031711578, "timestamp": "2025-09-02 14:18:28.412107", "step": 2796, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:28.465014", "step": 2796, "epoch": 3 }, { "type": "loss", "content": 0.1385766714811325, "timestamp": "2025-09-02 14:18:28.467306", "step": 2797, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:28.521150", "step": 2797, "epoch": 3 }, { "type": "loss", "content": 0.13451196253299713, "timestamp": "2025-09-02 14:18:28.523471", "step": 2798, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:28.576616", "step": 2798, "epoch": 3 }, { "type": "loss", "content": 0.15631601214408875, "timestamp": "2025-09-02 14:18:28.578846", "step": 2799, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:28.631951", "step": 2799, "epoch": 3 }, { "type": "loss", "content": 0.09730863571166992, "timestamp": "2025-09-02 14:18:28.638325", "step": 2800, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:18:30.543287", "step": 2800, "epoch": 3 }, { "type": "pplx", "content": 7696.889181333462, "timestamp": "2025-09-02 14:18:30.545565", "step": 2800, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2800", "timestamp": "2025-09-02 14:18:30.900234", "step": 2800, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:30.955152", "step": 2800, "epoch": 3 }, { "type": "loss", "content": 0.10570773482322693, "timestamp": "2025-09-02 14:18:30.958004", "step": 2801, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:31.013091", "step": 2801, "epoch": 3 }, { "type": "loss", "content": 0.12156925350427628, "timestamp": "2025-09-02 14:18:31.015237", "step": 2802, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:31.068617", "step": 2802, "epoch": 3 }, { "type": "loss", "content": 0.15518024563789368, "timestamp": "2025-09-02 14:18:31.070953", "step": 2803, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:31.123956", "step": 2803, "epoch": 3 }, { "type": "loss", "content": 0.10396572202444077, "timestamp": "2025-09-02 14:18:31.129783", "step": 2804, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:31.182748", "step": 2804, "epoch": 3 }, { "type": "loss", "content": 0.13499999046325684, "timestamp": "2025-09-02 14:18:31.185149", "step": 2805, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:31.238448", "step": 2805, "epoch": 3 }, { "type": "loss", "content": 0.17380154132843018, "timestamp": "2025-09-02 14:18:31.240635", "step": 2806, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:31.296410", "step": 2806, "epoch": 3 }, { "type": "loss", "content": 0.14366430044174194, "timestamp": "2025-09-02 14:18:31.298700", "step": 2807, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:31.354629", "step": 2807, "epoch": 3 }, { "type": "loss", "content": 0.23371970653533936, "timestamp": "2025-09-02 14:18:31.360948", "step": 2808, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:31.415270", "step": 2808, "epoch": 3 }, { "type": "loss", "content": 0.06520047783851624, "timestamp": "2025-09-02 14:18:31.417582", "step": 2809, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:31.470918", "step": 2809, "epoch": 3 }, { "type": "loss", "content": 0.1663033366203308, "timestamp": "2025-09-02 14:18:31.473308", "step": 2810, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:31.526432", "step": 2810, "epoch": 3 }, { "type": "loss", "content": 0.1768600344657898, "timestamp": "2025-09-02 14:18:31.528714", "step": 2811, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:31.582771", "step": 2811, "epoch": 3 }, { "type": "loss", "content": 0.11634551733732224, "timestamp": "2025-09-02 14:18:31.588719", "step": 2812, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:31.641832", "step": 2812, "epoch": 3 }, { "type": "loss", "content": 0.1707572638988495, "timestamp": "2025-09-02 14:18:31.644594", "step": 2813, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:31.697963", "step": 2813, "epoch": 3 }, { "type": "loss", "content": 0.17952556908130646, "timestamp": "2025-09-02 14:18:31.700173", "step": 2814, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:31.755736", "step": 2814, "epoch": 3 }, { "type": "loss", "content": 0.22266003489494324, "timestamp": "2025-09-02 14:18:31.757962", "step": 2815, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:31.810951", "step": 2815, "epoch": 3 }, { "type": "loss", "content": 0.12493284791707993, "timestamp": "2025-09-02 14:18:31.816683", "step": 2816, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:31.869920", "step": 2816, "epoch": 3 }, { "type": "loss", "content": 0.32779866456985474, "timestamp": "2025-09-02 14:18:31.871952", "step": 2817, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:31.925106", "step": 2817, "epoch": 3 }, { "type": "loss", "content": 0.16039349138736725, "timestamp": "2025-09-02 14:18:31.927364", "step": 2818, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:31.980282", "step": 2818, "epoch": 3 }, { "type": "loss", "content": 0.16620828211307526, "timestamp": "2025-09-02 14:18:31.982820", "step": 2819, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:32.035993", "step": 2819, "epoch": 3 }, { "type": "loss", "content": 0.11189339309930801, "timestamp": "2025-09-02 14:18:32.041765", "step": 2820, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:18:33.946423", "step": 2820, "epoch": 3 }, { "type": "pplx", "content": 7543.850643386526, "timestamp": "2025-09-02 14:18:33.948412", "step": 2820, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:34.002255", "step": 2820, "epoch": 3 }, { "type": "loss", "content": 0.08781614154577255, "timestamp": "2025-09-02 14:18:34.004701", "step": 2821, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:34.058510", "step": 2821, "epoch": 3 }, { "type": "loss", "content": 0.20142680406570435, "timestamp": "2025-09-02 14:18:34.060724", "step": 2822, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:34.114866", "step": 2822, "epoch": 3 }, { "type": "loss", "content": 0.2842947244644165, "timestamp": "2025-09-02 14:18:34.117158", "step": 2823, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:34.170597", "step": 2823, "epoch": 3 }, { "type": "loss", "content": 0.24635447561740875, "timestamp": "2025-09-02 14:18:34.176899", "step": 2824, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:34.230074", "step": 2824, "epoch": 3 }, { "type": "loss", "content": 0.16225241124629974, "timestamp": "2025-09-02 14:18:34.232446", "step": 2825, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:34.288057", "step": 2825, "epoch": 3 }, { "type": "loss", "content": 0.16380077600479126, "timestamp": "2025-09-02 14:18:34.290263", "step": 2826, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:34.344126", "step": 2826, "epoch": 3 }, { "type": "loss", "content": 0.2009585201740265, "timestamp": "2025-09-02 14:18:34.346722", "step": 2827, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:34.400146", "step": 2827, "epoch": 3 }, { "type": "loss", "content": 0.17205676436424255, "timestamp": "2025-09-02 14:18:34.406353", "step": 2828, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:34.458722", "step": 2828, "epoch": 3 }, { "type": "loss", "content": 0.10968756675720215, "timestamp": "2025-09-02 14:18:34.460835", "step": 2829, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:34.514578", "step": 2829, "epoch": 3 }, { "type": "loss", "content": 0.14923004806041718, "timestamp": "2025-09-02 14:18:34.516940", "step": 2830, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:34.570345", "step": 2830, "epoch": 3 }, { "type": "loss", "content": 0.2126232534646988, "timestamp": "2025-09-02 14:18:34.572499", "step": 2831, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:34.625762", "step": 2831, "epoch": 3 }, { "type": "loss", "content": 0.10991066694259644, "timestamp": "2025-09-02 14:18:34.631921", "step": 2832, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:34.685126", "step": 2832, "epoch": 3 }, { "type": "loss", "content": 0.19731342792510986, "timestamp": "2025-09-02 14:18:34.687522", "step": 2833, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:34.740277", "step": 2833, "epoch": 3 }, { "type": "loss", "content": 0.23562292754650116, "timestamp": "2025-09-02 14:18:34.742914", "step": 2834, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:34.797713", "step": 2834, "epoch": 3 }, { "type": "loss", "content": 0.2562689781188965, "timestamp": "2025-09-02 14:18:34.800140", "step": 2835, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:34.853934", "step": 2835, "epoch": 3 }, { "type": "loss", "content": 0.20993395149707794, "timestamp": "2025-09-02 14:18:34.860296", "step": 2836, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:34.913065", "step": 2836, "epoch": 3 }, { "type": "loss", "content": 0.14450736343860626, "timestamp": "2025-09-02 14:18:34.915449", "step": 2837, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:34.968629", "step": 2837, "epoch": 3 }, { "type": "loss", "content": 0.13913674652576447, "timestamp": "2025-09-02 14:18:34.971323", "step": 2838, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:35.025683", "step": 2838, "epoch": 3 }, { "type": "loss", "content": 0.18157443404197693, "timestamp": "2025-09-02 14:18:35.028216", "step": 2839, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:35.082608", "step": 2839, "epoch": 3 }, { "type": "loss", "content": 0.13128840923309326, "timestamp": "2025-09-02 14:18:35.089065", "step": 2840, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:18:36.990544", "step": 2840, "epoch": 3 }, { "type": "pplx", "content": 7380.368320648809, "timestamp": "2025-09-02 14:18:36.992704", "step": 2840, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2840", "timestamp": "2025-09-02 14:18:37.349604", "step": 2840, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:37.406263", "step": 2840, "epoch": 3 }, { "type": "loss", "content": 0.19442114233970642, "timestamp": "2025-09-02 14:18:37.408937", "step": 2841, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:37.464484", "step": 2841, "epoch": 3 }, { "type": "loss", "content": 0.23683442175388336, "timestamp": "2025-09-02 14:18:37.467370", "step": 2842, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:37.522035", "step": 2842, "epoch": 3 }, { "type": "loss", "content": 0.1611441969871521, "timestamp": "2025-09-02 14:18:37.524442", "step": 2843, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:37.577725", "step": 2843, "epoch": 3 }, { "type": "loss", "content": 0.17243310809135437, "timestamp": "2025-09-02 14:18:37.584009", "step": 2844, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:37.636600", "step": 2844, "epoch": 3 }, { "type": "loss", "content": 0.2090591937303543, "timestamp": "2025-09-02 14:18:37.638899", "step": 2845, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:37.693378", "step": 2845, "epoch": 3 }, { "type": "loss", "content": 0.20572443306446075, "timestamp": "2025-09-02 14:18:37.695761", "step": 2846, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:37.749404", "step": 2846, "epoch": 3 }, { "type": "loss", "content": 0.14742393791675568, "timestamp": "2025-09-02 14:18:37.751721", "step": 2847, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:37.804867", "step": 2847, "epoch": 3 }, { "type": "loss", "content": 0.1449538618326187, "timestamp": "2025-09-02 14:18:37.811113", "step": 2848, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:37.865719", "step": 2848, "epoch": 3 }, { "type": "loss", "content": 0.06785773485898972, "timestamp": "2025-09-02 14:18:37.868405", "step": 2849, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:37.923824", "step": 2849, "epoch": 3 }, { "type": "loss", "content": 0.17762243747711182, "timestamp": "2025-09-02 14:18:37.925940", "step": 2850, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:37.980716", "step": 2850, "epoch": 3 }, { "type": "loss", "content": 0.2248196303844452, "timestamp": "2025-09-02 14:18:37.983101", "step": 2851, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:38.037249", "step": 2851, "epoch": 3 }, { "type": "loss", "content": 0.138749361038208, "timestamp": "2025-09-02 14:18:38.043676", "step": 2852, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:38.099421", "step": 2852, "epoch": 3 }, { "type": "loss", "content": 0.2937338948249817, "timestamp": "2025-09-02 14:18:38.101653", "step": 2853, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:38.155117", "step": 2853, "epoch": 3 }, { "type": "loss", "content": 0.16941659152507782, "timestamp": "2025-09-02 14:18:38.157728", "step": 2854, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:38.212097", "step": 2854, "epoch": 3 }, { "type": "loss", "content": 0.07159364223480225, "timestamp": "2025-09-02 14:18:38.214607", "step": 2855, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:38.268159", "step": 2855, "epoch": 3 }, { "type": "loss", "content": 0.2246580570936203, "timestamp": "2025-09-02 14:18:38.274709", "step": 2856, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:38.327916", "step": 2856, "epoch": 3 }, { "type": "loss", "content": 0.09784230589866638, "timestamp": "2025-09-02 14:18:38.331091", "step": 2857, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:38.384728", "step": 2857, "epoch": 3 }, { "type": "loss", "content": 0.1604565531015396, "timestamp": "2025-09-02 14:18:38.386926", "step": 2858, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:38.441931", "step": 2858, "epoch": 3 }, { "type": "loss", "content": 0.13453872501850128, "timestamp": "2025-09-02 14:18:38.444249", "step": 2859, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:38.498842", "step": 2859, "epoch": 3 }, { "type": "loss", "content": 0.12313910573720932, "timestamp": "2025-09-02 14:18:38.505014", "step": 2860, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:18:40.425260", "step": 2860, "epoch": 3 }, { "type": "pplx", "content": 7250.999844418418, "timestamp": "2025-09-02 14:18:40.427677", "step": 2860, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:40.481445", "step": 2860, "epoch": 3 }, { "type": "loss", "content": 0.19535423815250397, "timestamp": "2025-09-02 14:18:40.483959", "step": 2861, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:40.538862", "step": 2861, "epoch": 3 }, { "type": "loss", "content": 0.17713700234889984, "timestamp": "2025-09-02 14:18:40.541328", "step": 2862, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:40.595181", "step": 2862, "epoch": 3 }, { "type": "loss", "content": 0.22779534757137299, "timestamp": "2025-09-02 14:18:40.597725", "step": 2863, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:40.651559", "step": 2863, "epoch": 3 }, { "type": "loss", "content": 0.20703557133674622, "timestamp": "2025-09-02 14:18:40.658159", "step": 2864, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:40.711196", "step": 2864, "epoch": 3 }, { "type": "loss", "content": 0.07275115698575974, "timestamp": "2025-09-02 14:18:40.713911", "step": 2865, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:40.768024", "step": 2865, "epoch": 3 }, { "type": "loss", "content": 0.22490118443965912, "timestamp": "2025-09-02 14:18:40.770848", "step": 2866, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:40.824759", "step": 2866, "epoch": 3 }, { "type": "loss", "content": 0.2798295021057129, "timestamp": "2025-09-02 14:18:40.827154", "step": 2867, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:40.880199", "step": 2867, "epoch": 3 }, { "type": "loss", "content": 0.14075253903865814, "timestamp": "2025-09-02 14:18:40.887915", "step": 2868, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:40.941295", "step": 2868, "epoch": 3 }, { "type": "loss", "content": 0.17027023434638977, "timestamp": "2025-09-02 14:18:40.943891", "step": 2869, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:40.997936", "step": 2869, "epoch": 3 }, { "type": "loss", "content": 0.21365948021411896, "timestamp": "2025-09-02 14:18:41.000686", "step": 2870, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:41.054096", "step": 2870, "epoch": 3 }, { "type": "loss", "content": 0.13588398694992065, "timestamp": "2025-09-02 14:18:41.056464", "step": 2871, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:41.110728", "step": 2871, "epoch": 3 }, { "type": "loss", "content": 0.21593663096427917, "timestamp": "2025-09-02 14:18:41.117036", "step": 2872, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:41.171799", "step": 2872, "epoch": 3 }, { "type": "loss", "content": 0.10792732983827591, "timestamp": "2025-09-02 14:18:41.174251", "step": 2873, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:41.227459", "step": 2873, "epoch": 3 }, { "type": "loss", "content": 0.16166305541992188, "timestamp": "2025-09-02 14:18:41.230073", "step": 2874, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:41.283322", "step": 2874, "epoch": 3 }, { "type": "loss", "content": 0.10129478573799133, "timestamp": "2025-09-02 14:18:41.285725", "step": 2875, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:41.340313", "step": 2875, "epoch": 3 }, { "type": "loss", "content": 0.18589158356189728, "timestamp": "2025-09-02 14:18:41.346389", "step": 2876, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:41.400406", "step": 2876, "epoch": 3 }, { "type": "loss", "content": 0.14042621850967407, "timestamp": "2025-09-02 14:18:41.402718", "step": 2877, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:41.456056", "step": 2877, "epoch": 3 }, { "type": "loss", "content": 0.09799084067344666, "timestamp": "2025-09-02 14:18:41.458440", "step": 2878, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:41.511924", "step": 2878, "epoch": 3 }, { "type": "loss", "content": 0.262643039226532, "timestamp": "2025-09-02 14:18:41.514223", "step": 2879, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:41.567659", "step": 2879, "epoch": 3 }, { "type": "loss", "content": 0.22343215346336365, "timestamp": "2025-09-02 14:18:41.573792", "step": 2880, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:18:43.494297", "step": 2880, "epoch": 3 }, { "type": "pplx", "content": 7250.2158077062795, "timestamp": "2025-09-02 14:18:43.496619", "step": 2880, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2880", "timestamp": "2025-09-02 14:18:43.857619", "step": 2880, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:43.915842", "step": 2880, "epoch": 3 }, { "type": "loss", "content": 0.13386580348014832, "timestamp": "2025-09-02 14:18:43.918223", "step": 2881, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:43.973846", "step": 2881, "epoch": 3 }, { "type": "loss", "content": 0.254265159368515, "timestamp": "2025-09-02 14:18:43.976165", "step": 2882, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:44.030578", "step": 2882, "epoch": 3 }, { "type": "loss", "content": 0.18378815054893494, "timestamp": "2025-09-02 14:18:44.033056", "step": 2883, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:44.086681", "step": 2883, "epoch": 3 }, { "type": "loss", "content": 0.10367362201213837, "timestamp": "2025-09-02 14:18:44.092782", "step": 2884, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:44.145584", "step": 2884, "epoch": 3 }, { "type": "loss", "content": 0.1317785233259201, "timestamp": "2025-09-02 14:18:44.147934", "step": 2885, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:44.201353", "step": 2885, "epoch": 3 }, { "type": "loss", "content": 0.2961713969707489, "timestamp": "2025-09-02 14:18:44.203731", "step": 2886, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:44.257884", "step": 2886, "epoch": 3 }, { "type": "loss", "content": 0.11648262292146683, "timestamp": "2025-09-02 14:18:44.260471", "step": 2887, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:44.316229", "step": 2887, "epoch": 3 }, { "type": "loss", "content": 0.08821257203817368, "timestamp": "2025-09-02 14:18:44.322613", "step": 2888, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:44.375366", "step": 2888, "epoch": 3 }, { "type": "loss", "content": 0.16476137936115265, "timestamp": "2025-09-02 14:18:44.377978", "step": 2889, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:44.431502", "step": 2889, "epoch": 3 }, { "type": "loss", "content": 0.11156026273965836, "timestamp": "2025-09-02 14:18:44.434058", "step": 2890, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:44.488541", "step": 2890, "epoch": 3 }, { "type": "loss", "content": 0.18787460029125214, "timestamp": "2025-09-02 14:18:44.490898", "step": 2891, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:44.544255", "step": 2891, "epoch": 3 }, { "type": "loss", "content": 0.0758783146739006, "timestamp": "2025-09-02 14:18:44.550352", "step": 2892, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:44.603735", "step": 2892, "epoch": 3 }, { "type": "loss", "content": 0.31879615783691406, "timestamp": "2025-09-02 14:18:44.606095", "step": 2893, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:44.659798", "step": 2893, "epoch": 3 }, { "type": "loss", "content": 0.22588250041007996, "timestamp": "2025-09-02 14:18:44.662378", "step": 2894, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:44.717150", "step": 2894, "epoch": 3 }, { "type": "loss", "content": 0.2166471928358078, "timestamp": "2025-09-02 14:18:44.719539", "step": 2895, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:44.773329", "step": 2895, "epoch": 3 }, { "type": "loss", "content": 0.1418554037809372, "timestamp": "2025-09-02 14:18:44.779663", "step": 2896, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:44.832672", "step": 2896, "epoch": 3 }, { "type": "loss", "content": 0.1302245557308197, "timestamp": "2025-09-02 14:18:44.835215", "step": 2897, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:44.888145", "step": 2897, "epoch": 3 }, { "type": "loss", "content": 0.08713299036026001, "timestamp": "2025-09-02 14:18:44.890874", "step": 2898, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:44.944256", "step": 2898, "epoch": 3 }, { "type": "loss", "content": 0.18668274581432343, "timestamp": "2025-09-02 14:18:44.946537", "step": 2899, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:44.999716", "step": 2899, "epoch": 3 }, { "type": "loss", "content": 0.11456654220819473, "timestamp": "2025-09-02 14:18:45.005809", "step": 2900, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:18:46.909973", "step": 2900, "epoch": 3 }, { "type": "pplx", "content": 7259.520816267462, "timestamp": "2025-09-02 14:18:46.912368", "step": 2900, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:46.965733", "step": 2900, "epoch": 3 }, { "type": "loss", "content": 0.21197006106376648, "timestamp": "2025-09-02 14:18:46.968340", "step": 2901, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:47.022679", "step": 2901, "epoch": 3 }, { "type": "loss", "content": 0.10357288271188736, "timestamp": "2025-09-02 14:18:47.025112", "step": 2902, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:47.078495", "step": 2902, "epoch": 3 }, { "type": "loss", "content": 0.09345965087413788, "timestamp": "2025-09-02 14:18:47.080831", "step": 2903, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:47.134555", "step": 2903, "epoch": 3 }, { "type": "loss", "content": 0.12369006872177124, "timestamp": "2025-09-02 14:18:47.141412", "step": 2904, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:47.194759", "step": 2904, "epoch": 3 }, { "type": "loss", "content": 0.17778049409389496, "timestamp": "2025-09-02 14:18:47.197171", "step": 2905, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:47.252057", "step": 2905, "epoch": 3 }, { "type": "loss", "content": 0.27677497267723083, "timestamp": "2025-09-02 14:18:47.254275", "step": 2906, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:47.307329", "step": 2906, "epoch": 3 }, { "type": "loss", "content": 0.18485760688781738, "timestamp": "2025-09-02 14:18:47.309827", "step": 2907, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:47.363128", "step": 2907, "epoch": 3 }, { "type": "loss", "content": 0.17999744415283203, "timestamp": "2025-09-02 14:18:47.370080", "step": 2908, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:47.422725", "step": 2908, "epoch": 3 }, { "type": "loss", "content": 0.21151158213615417, "timestamp": "2025-09-02 14:18:47.424908", "step": 2909, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:47.477413", "step": 2909, "epoch": 3 }, { "type": "loss", "content": 0.223786860704422, "timestamp": "2025-09-02 14:18:47.479528", "step": 2910, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:47.532659", "step": 2910, "epoch": 3 }, { "type": "loss", "content": 0.10498029738664627, "timestamp": "2025-09-02 14:18:47.534880", "step": 2911, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:47.587594", "step": 2911, "epoch": 3 }, { "type": "loss", "content": 0.16217100620269775, "timestamp": "2025-09-02 14:18:47.593684", "step": 2912, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:47.646496", "step": 2912, "epoch": 3 }, { "type": "loss", "content": 0.10557249933481216, "timestamp": "2025-09-02 14:18:47.649027", "step": 2913, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:47.702920", "step": 2913, "epoch": 3 }, { "type": "loss", "content": 0.20791760087013245, "timestamp": "2025-09-02 14:18:47.705321", "step": 2914, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:47.759807", "step": 2914, "epoch": 3 }, { "type": "loss", "content": 0.19950950145721436, "timestamp": "2025-09-02 14:18:47.762342", "step": 2915, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:47.818112", "step": 2915, "epoch": 3 }, { "type": "loss", "content": 0.12545478343963623, "timestamp": "2025-09-02 14:18:47.824546", "step": 2916, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:47.879500", "step": 2916, "epoch": 3 }, { "type": "loss", "content": 0.07420684397220612, "timestamp": "2025-09-02 14:18:47.882385", "step": 2917, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:47.936042", "step": 2917, "epoch": 3 }, { "type": "loss", "content": 0.10156212747097015, "timestamp": "2025-09-02 14:18:47.938747", "step": 2918, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:47.993567", "step": 2918, "epoch": 3 }, { "type": "loss", "content": 0.0910925418138504, "timestamp": "2025-09-02 14:18:47.996291", "step": 2919, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:48.053721", "step": 2919, "epoch": 3 }, { "type": "loss", "content": 0.18839381635189056, "timestamp": "2025-09-02 14:18:48.060123", "step": 2920, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:18:49.993936", "step": 2920, "epoch": 3 }, { "type": "pplx", "content": 7090.180246132522, "timestamp": "2025-09-02 14:18:49.997742", "step": 2920, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2920", "timestamp": "2025-09-02 14:18:50.363226", "step": 2920, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:50.419932", "step": 2920, "epoch": 3 }, { "type": "loss", "content": 0.21074463427066803, "timestamp": "2025-09-02 14:18:50.422302", "step": 2921, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:50.478613", "step": 2921, "epoch": 3 }, { "type": "loss", "content": 0.18458816409111023, "timestamp": "2025-09-02 14:18:50.481027", "step": 2922, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:50.535019", "step": 2922, "epoch": 3 }, { "type": "loss", "content": 0.1962638646364212, "timestamp": "2025-09-02 14:18:50.537746", "step": 2923, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:50.592152", "step": 2923, "epoch": 3 }, { "type": "loss", "content": 0.08433087915182114, "timestamp": "2025-09-02 14:18:50.598164", "step": 2924, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:50.651034", "step": 2924, "epoch": 3 }, { "type": "loss", "content": 0.23938322067260742, "timestamp": "2025-09-02 14:18:50.654083", "step": 2925, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:50.708832", "step": 2925, "epoch": 3 }, { "type": "loss", "content": 0.18543800711631775, "timestamp": "2025-09-02 14:18:50.711092", "step": 2926, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:50.764780", "step": 2926, "epoch": 3 }, { "type": "loss", "content": 0.1495160460472107, "timestamp": "2025-09-02 14:18:50.767009", "step": 2927, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:50.821675", "step": 2927, "epoch": 3 }, { "type": "loss", "content": 0.2908134162425995, "timestamp": "2025-09-02 14:18:50.827910", "step": 2928, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:50.882646", "step": 2928, "epoch": 3 }, { "type": "loss", "content": 0.10030308365821838, "timestamp": "2025-09-02 14:18:50.885021", "step": 2929, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:50.938165", "step": 2929, "epoch": 3 }, { "type": "loss", "content": 0.0974142849445343, "timestamp": "2025-09-02 14:18:50.940384", "step": 2930, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:50.993157", "step": 2930, "epoch": 3 }, { "type": "loss", "content": 0.25592729449272156, "timestamp": "2025-09-02 14:18:50.995422", "step": 2931, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:51.048673", "step": 2931, "epoch": 3 }, { "type": "loss", "content": 0.14607760310173035, "timestamp": "2025-09-02 14:18:51.054502", "step": 2932, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:51.106706", "step": 2932, "epoch": 3 }, { "type": "loss", "content": 0.2915804386138916, "timestamp": "2025-09-02 14:18:51.108993", "step": 2933, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:51.163941", "step": 2933, "epoch": 3 }, { "type": "loss", "content": 0.11494843661785126, "timestamp": "2025-09-02 14:18:51.166269", "step": 2934, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:51.220063", "step": 2934, "epoch": 3 }, { "type": "loss", "content": 0.2243773639202118, "timestamp": "2025-09-02 14:18:51.222495", "step": 2935, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:51.275652", "step": 2935, "epoch": 3 }, { "type": "loss", "content": 0.24465571343898773, "timestamp": "2025-09-02 14:18:51.281886", "step": 2936, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:51.335472", "step": 2936, "epoch": 3 }, { "type": "loss", "content": 0.04807603359222412, "timestamp": "2025-09-02 14:18:51.337926", "step": 2937, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:51.391084", "step": 2937, "epoch": 3 }, { "type": "loss", "content": 0.25113365054130554, "timestamp": "2025-09-02 14:18:51.393408", "step": 2938, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:51.446704", "step": 2938, "epoch": 3 }, { "type": "loss", "content": 0.1349542886018753, "timestamp": "2025-09-02 14:18:51.449006", "step": 2939, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:51.502083", "step": 2939, "epoch": 3 }, { "type": "loss", "content": 0.0710565596818924, "timestamp": "2025-09-02 14:18:51.508061", "step": 2940, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:18:53.401992", "step": 2940, "epoch": 3 }, { "type": "pplx", "content": 6936.326412348843, "timestamp": "2025-09-02 14:18:53.404240", "step": 2940, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:53.458048", "step": 2940, "epoch": 3 }, { "type": "loss", "content": 0.11889351904392242, "timestamp": "2025-09-02 14:18:53.463032", "step": 2941, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:53.516834", "step": 2941, "epoch": 3 }, { "type": "loss", "content": 0.15197423100471497, "timestamp": "2025-09-02 14:18:53.519430", "step": 2942, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:53.573650", "step": 2942, "epoch": 3 }, { "type": "loss", "content": 0.12532885372638702, "timestamp": "2025-09-02 14:18:53.576380", "step": 2943, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:53.630784", "step": 2943, "epoch": 3 }, { "type": "loss", "content": 0.19318313896656036, "timestamp": "2025-09-02 14:18:53.636975", "step": 2944, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:53.689492", "step": 2944, "epoch": 3 }, { "type": "loss", "content": 0.18899139761924744, "timestamp": "2025-09-02 14:18:53.691872", "step": 2945, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:53.744616", "step": 2945, "epoch": 3 }, { "type": "loss", "content": 0.11815184354782104, "timestamp": "2025-09-02 14:18:53.746849", "step": 2946, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:53.800172", "step": 2946, "epoch": 3 }, { "type": "loss", "content": 0.14944836497306824, "timestamp": "2025-09-02 14:18:53.802700", "step": 2947, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:53.856023", "step": 2947, "epoch": 3 }, { "type": "loss", "content": 0.21121078729629517, "timestamp": "2025-09-02 14:18:53.862137", "step": 2948, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:53.914403", "step": 2948, "epoch": 3 }, { "type": "loss", "content": 0.14834213256835938, "timestamp": "2025-09-02 14:18:53.916673", "step": 2949, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:53.969927", "step": 2949, "epoch": 3 }, { "type": "loss", "content": 0.2318708598613739, "timestamp": "2025-09-02 14:18:53.972427", "step": 2950, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:54.026212", "step": 2950, "epoch": 3 }, { "type": "loss", "content": 0.07395126670598984, "timestamp": "2025-09-02 14:18:54.029149", "step": 2951, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:54.082676", "step": 2951, "epoch": 3 }, { "type": "loss", "content": 0.20573917031288147, "timestamp": "2025-09-02 14:18:54.088916", "step": 2952, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:54.143838", "step": 2952, "epoch": 3 }, { "type": "loss", "content": 0.1376255452632904, "timestamp": "2025-09-02 14:18:54.146278", "step": 2953, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:54.199279", "step": 2953, "epoch": 3 }, { "type": "loss", "content": 0.155879944562912, "timestamp": "2025-09-02 14:18:54.201724", "step": 2954, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:54.254872", "step": 2954, "epoch": 3 }, { "type": "loss", "content": 0.09937679022550583, "timestamp": "2025-09-02 14:18:54.257301", "step": 2955, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:54.310621", "step": 2955, "epoch": 3 }, { "type": "loss", "content": 0.24622835218906403, "timestamp": "2025-09-02 14:18:54.316688", "step": 2956, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:54.368896", "step": 2956, "epoch": 3 }, { "type": "loss", "content": 0.125514954328537, "timestamp": "2025-09-02 14:18:54.371948", "step": 2957, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:54.425525", "step": 2957, "epoch": 3 }, { "type": "loss", "content": 0.12500633299350739, "timestamp": "2025-09-02 14:18:54.428089", "step": 2958, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:54.481321", "step": 2958, "epoch": 3 }, { "type": "loss", "content": 0.13069786131381989, "timestamp": "2025-09-02 14:18:54.483785", "step": 2959, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:54.536797", "step": 2959, "epoch": 3 }, { "type": "loss", "content": 0.15435467660427094, "timestamp": "2025-09-02 14:18:54.542890", "step": 2960, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:18:56.424216", "step": 2960, "epoch": 3 }, { "type": "pplx", "content": 6977.443682104272, "timestamp": "2025-09-02 14:18:56.426539", "step": 2960, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2960", "timestamp": "2025-09-02 14:18:56.836018", "step": 2960, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:56.893203", "step": 2960, "epoch": 3 }, { "type": "loss", "content": 0.15388336777687073, "timestamp": "2025-09-02 14:18:56.895739", "step": 2961, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:56.950083", "step": 2961, "epoch": 3 }, { "type": "loss", "content": 0.28672730922698975, "timestamp": "2025-09-02 14:18:56.952613", "step": 2962, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:57.006514", "step": 2962, "epoch": 3 }, { "type": "loss", "content": 0.3886074423789978, "timestamp": "2025-09-02 14:18:57.009205", "step": 2963, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:57.062744", "step": 2963, "epoch": 3 }, { "type": "loss", "content": 0.08437421917915344, "timestamp": "2025-09-02 14:18:57.069161", "step": 2964, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:57.122516", "step": 2964, "epoch": 3 }, { "type": "loss", "content": 0.2899521291255951, "timestamp": "2025-09-02 14:18:57.125362", "step": 2965, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:57.180772", "step": 2965, "epoch": 3 }, { "type": "loss", "content": 0.08096746355295181, "timestamp": "2025-09-02 14:18:57.183295", "step": 2966, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:57.237136", "step": 2966, "epoch": 3 }, { "type": "loss", "content": 0.18723858892917633, "timestamp": "2025-09-02 14:18:57.239795", "step": 2967, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:57.293745", "step": 2967, "epoch": 3 }, { "type": "loss", "content": 0.1588168740272522, "timestamp": "2025-09-02 14:18:57.300122", "step": 2968, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:57.353693", "step": 2968, "epoch": 3 }, { "type": "loss", "content": 0.31702297925949097, "timestamp": "2025-09-02 14:18:57.356493", "step": 2969, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:57.409814", "step": 2969, "epoch": 3 }, { "type": "loss", "content": 0.05763112008571625, "timestamp": "2025-09-02 14:18:57.412330", "step": 2970, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:57.466250", "step": 2970, "epoch": 3 }, { "type": "loss", "content": 0.3091512620449066, "timestamp": "2025-09-02 14:18:57.468824", "step": 2971, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:57.522324", "step": 2971, "epoch": 3 }, { "type": "loss", "content": 0.12970136106014252, "timestamp": "2025-09-02 14:18:57.528484", "step": 2972, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:57.581154", "step": 2972, "epoch": 3 }, { "type": "loss", "content": 0.0953720435500145, "timestamp": "2025-09-02 14:18:57.583950", "step": 2973, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:57.637319", "step": 2973, "epoch": 3 }, { "type": "loss", "content": 0.237746924161911, "timestamp": "2025-09-02 14:18:57.639761", "step": 2974, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:18:57.695337", "step": 2974, "epoch": 3 }, { "type": "loss", "content": 0.16112595796585083, "timestamp": "2025-09-02 14:18:57.697861", "step": 2975, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:57.751664", "step": 2975, "epoch": 3 }, { "type": "loss", "content": 0.14432907104492188, "timestamp": "2025-09-02 14:18:57.757658", "step": 2976, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:57.810469", "step": 2976, "epoch": 3 }, { "type": "loss", "content": 0.0980573445558548, "timestamp": "2025-09-02 14:18:57.812798", "step": 2977, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:57.866239", "step": 2977, "epoch": 3 }, { "type": "loss", "content": 0.1764982044696808, "timestamp": "2025-09-02 14:18:57.868737", "step": 2978, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:18:57.922377", "step": 2978, "epoch": 3 }, { "type": "loss", "content": 0.0793195590376854, "timestamp": "2025-09-02 14:18:57.924939", "step": 2979, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:18:57.977965", "step": 2979, "epoch": 3 }, { "type": "loss", "content": 0.20119214057922363, "timestamp": "2025-09-02 14:18:57.984485", "step": 2980, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:18:59.949135", "step": 2980, "epoch": 3 }, { "type": "pplx", "content": 7271.938988350362, "timestamp": "2025-09-02 14:18:59.951468", "step": 2980, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:00.005379", "step": 2980, "epoch": 3 }, { "type": "loss", "content": 0.09639981389045715, "timestamp": "2025-09-02 14:19:00.007881", "step": 2981, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:00.062813", "step": 2981, "epoch": 3 }, { "type": "loss", "content": 0.16860830783843994, "timestamp": "2025-09-02 14:19:00.064985", "step": 2982, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:00.119427", "step": 2982, "epoch": 3 }, { "type": "loss", "content": 0.17348477244377136, "timestamp": "2025-09-02 14:19:00.122559", "step": 2983, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:00.177599", "step": 2983, "epoch": 3 }, { "type": "loss", "content": 0.14150823652744293, "timestamp": "2025-09-02 14:19:00.184209", "step": 2984, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:00.236971", "step": 2984, "epoch": 3 }, { "type": "loss", "content": 0.14428764581680298, "timestamp": "2025-09-02 14:19:00.239640", "step": 2985, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:00.293581", "step": 2985, "epoch": 3 }, { "type": "loss", "content": 0.26975885033607483, "timestamp": "2025-09-02 14:19:00.296010", "step": 2986, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:00.349583", "step": 2986, "epoch": 3 }, { "type": "loss", "content": 0.16833655536174774, "timestamp": "2025-09-02 14:19:00.352215", "step": 2987, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:00.405674", "step": 2987, "epoch": 3 }, { "type": "loss", "content": 0.17844609916210175, "timestamp": "2025-09-02 14:19:00.412264", "step": 2988, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:00.465102", "step": 2988, "epoch": 3 }, { "type": "loss", "content": 0.22674605250358582, "timestamp": "2025-09-02 14:19:00.467479", "step": 2989, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:00.522739", "step": 2989, "epoch": 3 }, { "type": "loss", "content": 0.06752471625804901, "timestamp": "2025-09-02 14:19:00.525402", "step": 2990, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:00.579475", "step": 2990, "epoch": 3 }, { "type": "loss", "content": 0.06808966398239136, "timestamp": "2025-09-02 14:19:00.581909", "step": 2991, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:00.635257", "step": 2991, "epoch": 3 }, { "type": "loss", "content": 0.13776971399784088, "timestamp": "2025-09-02 14:19:00.641467", "step": 2992, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:00.695037", "step": 2992, "epoch": 3 }, { "type": "loss", "content": 0.12312278896570206, "timestamp": "2025-09-02 14:19:00.697409", "step": 2993, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:00.759878", "step": 2993, "epoch": 3 }, { "type": "loss", "content": 0.1396709382534027, "timestamp": "2025-09-02 14:19:00.762304", "step": 2994, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:00.817716", "step": 2994, "epoch": 3 }, { "type": "loss", "content": 0.1633550077676773, "timestamp": "2025-09-02 14:19:00.820166", "step": 2995, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:00.876498", "step": 2995, "epoch": 3 }, { "type": "loss", "content": 0.1881536841392517, "timestamp": "2025-09-02 14:19:00.883323", "step": 2996, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:00.937553", "step": 2996, "epoch": 3 }, { "type": "loss", "content": 0.2928946912288666, "timestamp": "2025-09-02 14:19:00.940349", "step": 2997, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:00.994473", "step": 2997, "epoch": 3 }, { "type": "loss", "content": 0.25773751735687256, "timestamp": "2025-09-02 14:19:00.997146", "step": 2998, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:01.050396", "step": 2998, "epoch": 3 }, { "type": "loss", "content": 0.10399225354194641, "timestamp": "2025-09-02 14:19:01.053254", "step": 2999, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:01.107174", "step": 2999, "epoch": 3 }, { "type": "loss", "content": 0.19767288863658905, "timestamp": "2025-09-02 14:19:01.113474", "step": 3000, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:19:03.020719", "step": 3000, "epoch": 3 }, { "type": "pplx", "content": 7482.679949340198, "timestamp": "2025-09-02 14:19:03.022963", "step": 3000, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3000", "timestamp": "2025-09-02 14:19:03.382599", "step": 3000, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:03.437125", "step": 3000, "epoch": 3 }, { "type": "loss", "content": 0.16495169699192047, "timestamp": "2025-09-02 14:19:03.440160", "step": 3001, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:03.495525", "step": 3001, "epoch": 3 }, { "type": "loss", "content": 0.21235069632530212, "timestamp": "2025-09-02 14:19:03.498245", "step": 3002, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:03.551785", "step": 3002, "epoch": 3 }, { "type": "loss", "content": 0.08702453225851059, "timestamp": "2025-09-02 14:19:03.554754", "step": 3003, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:03.610066", "step": 3003, "epoch": 3 }, { "type": "loss", "content": 0.2134970873594284, "timestamp": "2025-09-02 14:19:03.616595", "step": 3004, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:03.669825", "step": 3004, "epoch": 3 }, { "type": "loss", "content": 0.2012423723936081, "timestamp": "2025-09-02 14:19:03.672413", "step": 3005, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:03.725208", "step": 3005, "epoch": 3 }, { "type": "loss", "content": 0.16735973954200745, "timestamp": "2025-09-02 14:19:03.728258", "step": 3006, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:03.782373", "step": 3006, "epoch": 3 }, { "type": "loss", "content": 0.11409283429384232, "timestamp": "2025-09-02 14:19:03.786683", "step": 3007, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:03.842268", "step": 3007, "epoch": 3 }, { "type": "loss", "content": 0.0668964684009552, "timestamp": "2025-09-02 14:19:03.848266", "step": 3008, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:03.902792", "step": 3008, "epoch": 3 }, { "type": "loss", "content": 0.13999401032924652, "timestamp": "2025-09-02 14:19:03.905302", "step": 3009, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:03.958836", "step": 3009, "epoch": 3 }, { "type": "loss", "content": 0.24963577091693878, "timestamp": "2025-09-02 14:19:03.961523", "step": 3010, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:04.014936", "step": 3010, "epoch": 3 }, { "type": "loss", "content": 0.17135845124721527, "timestamp": "2025-09-02 14:19:04.017729", "step": 3011, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:04.072334", "step": 3011, "epoch": 3 }, { "type": "loss", "content": 0.09586846828460693, "timestamp": "2025-09-02 14:19:04.079617", "step": 3012, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:04.132708", "step": 3012, "epoch": 3 }, { "type": "loss", "content": 0.1929367333650589, "timestamp": "2025-09-02 14:19:04.135366", "step": 3013, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:04.189560", "step": 3013, "epoch": 3 }, { "type": "loss", "content": 0.09684619307518005, "timestamp": "2025-09-02 14:19:04.192143", "step": 3014, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:04.250074", "step": 3014, "epoch": 3 }, { "type": "loss", "content": 0.1387886106967926, "timestamp": "2025-09-02 14:19:04.258778", "step": 3015, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:04.319574", "step": 3015, "epoch": 3 }, { "type": "loss", "content": 0.0482068732380867, "timestamp": "2025-09-02 14:19:04.325619", "step": 3016, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:04.379541", "step": 3016, "epoch": 3 }, { "type": "loss", "content": 0.14664120972156525, "timestamp": "2025-09-02 14:19:04.382151", "step": 3017, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:04.435724", "step": 3017, "epoch": 3 }, { "type": "loss", "content": 0.0809343159198761, "timestamp": "2025-09-02 14:19:04.438117", "step": 3018, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:04.491876", "step": 3018, "epoch": 3 }, { "type": "loss", "content": 0.1527862846851349, "timestamp": "2025-09-02 14:19:04.494359", "step": 3019, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:04.547391", "step": 3019, "epoch": 3 }, { "type": "loss", "content": 0.24720771610736847, "timestamp": "2025-09-02 14:19:04.553494", "step": 3020, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:19:06.489435", "step": 3020, "epoch": 3 }, { "type": "pplx", "content": 7670.042656025564, "timestamp": "2025-09-02 14:19:06.491912", "step": 3020, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:06.545417", "step": 3020, "epoch": 3 }, { "type": "loss", "content": 0.11065145581960678, "timestamp": "2025-09-02 14:19:06.547950", "step": 3021, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:06.610385", "step": 3021, "epoch": 3 }, { "type": "loss", "content": 0.20634210109710693, "timestamp": "2025-09-02 14:19:06.613029", "step": 3022, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:06.667031", "step": 3022, "epoch": 3 }, { "type": "loss", "content": 0.1082688421010971, "timestamp": "2025-09-02 14:19:06.669899", "step": 3023, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:06.725427", "step": 3023, "epoch": 3 }, { "type": "loss", "content": 0.17354516685009003, "timestamp": "2025-09-02 14:19:06.731879", "step": 3024, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:06.785377", "step": 3024, "epoch": 3 }, { "type": "loss", "content": 0.13865076005458832, "timestamp": "2025-09-02 14:19:06.790606", "step": 3025, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:06.845701", "step": 3025, "epoch": 3 }, { "type": "loss", "content": 0.23981112241744995, "timestamp": "2025-09-02 14:19:06.848333", "step": 3026, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:06.902444", "step": 3026, "epoch": 3 }, { "type": "loss", "content": 0.1686628758907318, "timestamp": "2025-09-02 14:19:06.905475", "step": 3027, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:06.959026", "step": 3027, "epoch": 3 }, { "type": "loss", "content": 0.18444247543811798, "timestamp": "2025-09-02 14:19:06.965150", "step": 3028, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:07.018327", "step": 3028, "epoch": 3 }, { "type": "loss", "content": 0.06000528857111931, "timestamp": "2025-09-02 14:19:07.021241", "step": 3029, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:07.075622", "step": 3029, "epoch": 3 }, { "type": "loss", "content": 0.19392642378807068, "timestamp": "2025-09-02 14:19:07.078201", "step": 3030, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:07.133070", "step": 3030, "epoch": 3 }, { "type": "loss", "content": 0.21900838613510132, "timestamp": "2025-09-02 14:19:07.135574", "step": 3031, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:07.190239", "step": 3031, "epoch": 3 }, { "type": "loss", "content": 0.1265254020690918, "timestamp": "2025-09-02 14:19:07.196945", "step": 3032, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:07.250574", "step": 3032, "epoch": 3 }, { "type": "loss", "content": 0.1486111879348755, "timestamp": "2025-09-02 14:19:07.253481", "step": 3033, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:07.308310", "step": 3033, "epoch": 3 }, { "type": "loss", "content": 0.09946329146623611, "timestamp": "2025-09-02 14:19:07.311297", "step": 3034, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:07.366003", "step": 3034, "epoch": 3 }, { "type": "loss", "content": 0.16201353073120117, "timestamp": "2025-09-02 14:19:07.368411", "step": 3035, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:07.422864", "step": 3035, "epoch": 3 }, { "type": "loss", "content": 0.1796557903289795, "timestamp": "2025-09-02 14:19:07.429306", "step": 3036, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:07.483385", "step": 3036, "epoch": 3 }, { "type": "loss", "content": 0.19864381849765778, "timestamp": "2025-09-02 14:19:07.485908", "step": 3037, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:07.539236", "step": 3037, "epoch": 3 }, { "type": "loss", "content": 0.06322536617517471, "timestamp": "2025-09-02 14:19:07.542105", "step": 3038, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:07.596959", "step": 3038, "epoch": 3 }, { "type": "loss", "content": 0.1110023558139801, "timestamp": "2025-09-02 14:19:07.599681", "step": 3039, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:07.653772", "step": 3039, "epoch": 3 }, { "type": "loss", "content": 0.07791918516159058, "timestamp": "2025-09-02 14:19:07.659792", "step": 3040, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:19:09.587850", "step": 3040, "epoch": 3 }, { "type": "pplx", "content": 7871.099417100515, "timestamp": "2025-09-02 14:19:09.590590", "step": 3040, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3040", "timestamp": "2025-09-02 14:19:09.956579", "step": 3040, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:10.012766", "step": 3040, "epoch": 3 }, { "type": "loss", "content": 0.2692968249320984, "timestamp": "2025-09-02 14:19:10.015341", "step": 3041, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:10.071051", "step": 3041, "epoch": 3 }, { "type": "loss", "content": 0.23547445237636566, "timestamp": "2025-09-02 14:19:10.073349", "step": 3042, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:10.127103", "step": 3042, "epoch": 3 }, { "type": "loss", "content": 0.16474087536334991, "timestamp": "2025-09-02 14:19:10.129981", "step": 3043, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:10.183280", "step": 3043, "epoch": 3 }, { "type": "loss", "content": 0.13515354692935944, "timestamp": "2025-09-02 14:19:10.189685", "step": 3044, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:10.242778", "step": 3044, "epoch": 3 }, { "type": "loss", "content": 0.10673167556524277, "timestamp": "2025-09-02 14:19:10.245110", "step": 3045, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:10.298677", "step": 3045, "epoch": 3 }, { "type": "loss", "content": 0.1566457450389862, "timestamp": "2025-09-02 14:19:10.301390", "step": 3046, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:10.356196", "step": 3046, "epoch": 3 }, { "type": "loss", "content": 0.13798503577709198, "timestamp": "2025-09-02 14:19:10.358665", "step": 3047, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:10.413324", "step": 3047, "epoch": 3 }, { "type": "loss", "content": 0.15547268092632294, "timestamp": "2025-09-02 14:19:10.419631", "step": 3048, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:10.472611", "step": 3048, "epoch": 3 }, { "type": "loss", "content": 0.2434980571269989, "timestamp": "2025-09-02 14:19:10.475111", "step": 3049, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:10.528240", "step": 3049, "epoch": 3 }, { "type": "loss", "content": 0.1847556233406067, "timestamp": "2025-09-02 14:19:10.530571", "step": 3050, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:10.584247", "step": 3050, "epoch": 3 }, { "type": "loss", "content": 0.22568295896053314, "timestamp": "2025-09-02 14:19:10.586538", "step": 3051, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:10.640098", "step": 3051, "epoch": 3 }, { "type": "loss", "content": 0.11044846475124359, "timestamp": "2025-09-02 14:19:10.646193", "step": 3052, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-02 14:19:10.699260", "step": 3052, "epoch": 3 }, { "type": "loss", "content": 0.14124901592731476, "timestamp": "2025-09-02 14:19:10.701860", "step": 3053, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:10.755461", "step": 3053, "epoch": 3 }, { "type": "loss", "content": 0.13435617089271545, "timestamp": "2025-09-02 14:19:10.757888", "step": 3054, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:10.812668", "step": 3054, "epoch": 3 }, { "type": "loss", "content": 0.24760957062244415, "timestamp": "2025-09-02 14:19:10.815094", "step": 3055, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:10.868079", "step": 3055, "epoch": 3 }, { "type": "loss", "content": 0.44322237372398376, "timestamp": "2025-09-02 14:19:10.874346", "step": 3056, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:10.927662", "step": 3056, "epoch": 3 }, { "type": "loss", "content": 0.1899731606245041, "timestamp": "2025-09-02 14:19:10.930199", "step": 3057, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:10.983202", "step": 3057, "epoch": 3 }, { "type": "loss", "content": 0.0773007869720459, "timestamp": "2025-09-02 14:19:10.985572", "step": 3058, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:11.039151", "step": 3058, "epoch": 3 }, { "type": "loss", "content": 0.14931358397006989, "timestamp": "2025-09-02 14:19:11.042584", "step": 3059, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:11.096750", "step": 3059, "epoch": 3 }, { "type": "loss", "content": 0.12282412499189377, "timestamp": "2025-09-02 14:19:11.102469", "step": 3060, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:19:13.001787", "step": 3060, "epoch": 3 }, { "type": "pplx", "content": 8033.792284167352, "timestamp": "2025-09-02 14:19:13.004457", "step": 3060, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:13.056913", "step": 3060, "epoch": 3 }, { "type": "loss", "content": 0.10685033351182938, "timestamp": "2025-09-02 14:19:13.059314", "step": 3061, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:13.113340", "step": 3061, "epoch": 3 }, { "type": "loss", "content": 0.09418138116598129, "timestamp": "2025-09-02 14:19:13.116069", "step": 3062, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:13.169472", "step": 3062, "epoch": 3 }, { "type": "loss", "content": 0.2668313980102539, "timestamp": "2025-09-02 14:19:13.172254", "step": 3063, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:13.225691", "step": 3063, "epoch": 3 }, { "type": "loss", "content": 0.07043126225471497, "timestamp": "2025-09-02 14:19:13.232855", "step": 3064, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:13.288121", "step": 3064, "epoch": 3 }, { "type": "loss", "content": 0.08753778785467148, "timestamp": "2025-09-02 14:19:13.290687", "step": 3065, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:13.359313", "step": 3065, "epoch": 3 }, { "type": "loss", "content": 0.08007662743330002, "timestamp": "2025-09-02 14:19:13.361695", "step": 3066, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:13.424601", "step": 3066, "epoch": 3 }, { "type": "loss", "content": 0.25370538234710693, "timestamp": "2025-09-02 14:19:13.427410", "step": 3067, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:13.489762", "step": 3067, "epoch": 3 }, { "type": "loss", "content": 0.0911126583814621, "timestamp": "2025-09-02 14:19:13.495895", "step": 3068, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:13.561907", "step": 3068, "epoch": 3 }, { "type": "loss", "content": 0.11416801810264587, "timestamp": "2025-09-02 14:19:13.564465", "step": 3069, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:13.627259", "step": 3069, "epoch": 3 }, { "type": "loss", "content": 0.12282413989305496, "timestamp": "2025-09-02 14:19:13.629872", "step": 3070, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:13.689707", "step": 3070, "epoch": 3 }, { "type": "loss", "content": 0.22270867228507996, "timestamp": "2025-09-02 14:19:13.692090", "step": 3071, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:13.752313", "step": 3071, "epoch": 3 }, { "type": "loss", "content": 0.08032254874706268, "timestamp": "2025-09-02 14:19:13.758395", "step": 3072, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:13.820067", "step": 3072, "epoch": 3 }, { "type": "loss", "content": 0.09881503134965897, "timestamp": "2025-09-02 14:19:13.822665", "step": 3073, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:13.882753", "step": 3073, "epoch": 3 }, { "type": "loss", "content": 0.12556219100952148, "timestamp": "2025-09-02 14:19:13.885003", "step": 3074, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:13.940348", "step": 3074, "epoch": 3 }, { "type": "loss", "content": 0.05400716885924339, "timestamp": "2025-09-02 14:19:13.943015", "step": 3075, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:14.006418", "step": 3075, "epoch": 3 }, { "type": "loss", "content": 0.2652280032634735, "timestamp": "2025-09-02 14:19:14.012598", "step": 3076, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:14.072909", "step": 3076, "epoch": 3 }, { "type": "loss", "content": 0.07901933044195175, "timestamp": "2025-09-02 14:19:14.075311", "step": 3077, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:14.140442", "step": 3077, "epoch": 3 }, { "type": "loss", "content": 0.21990124881267548, "timestamp": "2025-09-02 14:19:14.143322", "step": 3078, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:14.215557", "step": 3078, "epoch": 3 }, { "type": "loss", "content": 0.12530691921710968, "timestamp": "2025-09-02 14:19:14.218462", "step": 3079, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:14.279613", "step": 3079, "epoch": 3 }, { "type": "loss", "content": 0.11266493797302246, "timestamp": "2025-09-02 14:19:14.285789", "step": 3080, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:19:16.809813", "step": 3080, "epoch": 3 }, { "type": "pplx", "content": 8054.979326889581, "timestamp": "2025-09-02 14:19:16.812141", "step": 3080, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3080", "timestamp": "2025-09-02 14:19:17.187585", "step": 3080, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:17.244642", "step": 3080, "epoch": 3 }, { "type": "loss", "content": 0.14308318495750427, "timestamp": "2025-09-02 14:19:17.247249", "step": 3081, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:17.307345", "step": 3081, "epoch": 3 }, { "type": "loss", "content": 0.1806902140378952, "timestamp": "2025-09-02 14:19:17.310217", "step": 3082, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:17.369170", "step": 3082, "epoch": 3 }, { "type": "loss", "content": 0.08464343845844269, "timestamp": "2025-09-02 14:19:17.371764", "step": 3083, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:17.434671", "step": 3083, "epoch": 3 }, { "type": "loss", "content": 0.0834922045469284, "timestamp": "2025-09-02 14:19:17.440818", "step": 3084, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:17.506726", "step": 3084, "epoch": 3 }, { "type": "loss", "content": 0.15732966363430023, "timestamp": "2025-09-02 14:19:17.509367", "step": 3085, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:17.564318", "step": 3085, "epoch": 3 }, { "type": "loss", "content": 0.20425158739089966, "timestamp": "2025-09-02 14:19:17.567721", "step": 3086, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:17.634197", "step": 3086, "epoch": 3 }, { "type": "loss", "content": 0.2379133254289627, "timestamp": "2025-09-02 14:19:17.636772", "step": 3087, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:17.696393", "step": 3087, "epoch": 3 }, { "type": "loss", "content": 0.2425532042980194, "timestamp": "2025-09-02 14:19:17.702531", "step": 3088, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:17.760611", "step": 3088, "epoch": 3 }, { "type": "loss", "content": 0.1222371906042099, "timestamp": "2025-09-02 14:19:17.763319", "step": 3089, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:17.829029", "step": 3089, "epoch": 3 }, { "type": "loss", "content": 0.07547710835933685, "timestamp": "2025-09-02 14:19:17.831542", "step": 3090, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:17.890174", "step": 3090, "epoch": 3 }, { "type": "loss", "content": 0.14233888685703278, "timestamp": "2025-09-02 14:19:17.892690", "step": 3091, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:17.947275", "step": 3091, "epoch": 3 }, { "type": "loss", "content": 0.21583354473114014, "timestamp": "2025-09-02 14:19:17.953531", "step": 3092, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:18.015522", "step": 3092, "epoch": 3 }, { "type": "loss", "content": 0.1491415947675705, "timestamp": "2025-09-02 14:19:18.018050", "step": 3093, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:18.075672", "step": 3093, "epoch": 3 }, { "type": "loss", "content": 0.08343899995088577, "timestamp": "2025-09-02 14:19:18.078357", "step": 3094, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:18.137979", "step": 3094, "epoch": 3 }, { "type": "loss", "content": 0.1978183090686798, "timestamp": "2025-09-02 14:19:18.140632", "step": 3095, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:18.193931", "step": 3095, "epoch": 3 }, { "type": "loss", "content": 0.14746710658073425, "timestamp": "2025-09-02 14:19:18.200217", "step": 3096, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:18.253758", "step": 3096, "epoch": 3 }, { "type": "loss", "content": 0.1842956691980362, "timestamp": "2025-09-02 14:19:18.257052", "step": 3097, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:18.319747", "step": 3097, "epoch": 3 }, { "type": "loss", "content": 0.19467328488826752, "timestamp": "2025-09-02 14:19:18.322429", "step": 3098, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:18.383117", "step": 3098, "epoch": 3 }, { "type": "loss", "content": 0.1352662593126297, "timestamp": "2025-09-02 14:19:18.385707", "step": 3099, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:18.479313", "step": 3099, "epoch": 3 }, { "type": "loss", "content": 0.2065550535917282, "timestamp": "2025-09-02 14:19:18.485653", "step": 3100, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:19:21.066611", "step": 3100, "epoch": 3 }, { "type": "pplx", "content": 8159.3429523852465, "timestamp": "2025-09-02 14:19:21.068881", "step": 3100, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:21.121274", "step": 3100, "epoch": 3 }, { "type": "loss", "content": 0.1345558613538742, "timestamp": "2025-09-02 14:19:21.123842", "step": 3101, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:21.177102", "step": 3101, "epoch": 3 }, { "type": "loss", "content": 0.3055059015750885, "timestamp": "2025-09-02 14:19:21.179502", "step": 3102, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:21.238585", "step": 3102, "epoch": 3 }, { "type": "loss", "content": 0.215633824467659, "timestamp": "2025-09-02 14:19:21.241232", "step": 3103, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:21.294449", "step": 3103, "epoch": 3 }, { "type": "loss", "content": 0.12260961532592773, "timestamp": "2025-09-02 14:19:21.300742", "step": 3104, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:21.354891", "step": 3104, "epoch": 3 }, { "type": "loss", "content": 0.1990206241607666, "timestamp": "2025-09-02 14:19:21.357229", "step": 3105, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:21.409993", "step": 3105, "epoch": 3 }, { "type": "loss", "content": 0.20384502410888672, "timestamp": "2025-09-02 14:19:21.412493", "step": 3106, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:21.465608", "step": 3106, "epoch": 3 }, { "type": "loss", "content": 0.1336534470319748, "timestamp": "2025-09-02 14:19:21.468151", "step": 3107, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:21.521005", "step": 3107, "epoch": 3 }, { "type": "loss", "content": 0.3594685196876526, "timestamp": "2025-09-02 14:19:21.527361", "step": 3108, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:21.581207", "step": 3108, "epoch": 3 }, { "type": "loss", "content": 0.2895708978176117, "timestamp": "2025-09-02 14:19:21.583711", "step": 3109, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:21.641516", "step": 3109, "epoch": 3 }, { "type": "loss", "content": 0.2314918041229248, "timestamp": "2025-09-02 14:19:21.644477", "step": 3110, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:21.701319", "step": 3110, "epoch": 3 }, { "type": "loss", "content": 0.20713534951210022, "timestamp": "2025-09-02 14:19:21.703886", "step": 3111, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:21.760855", "step": 3111, "epoch": 3 }, { "type": "loss", "content": 0.1314113438129425, "timestamp": "2025-09-02 14:19:21.766882", "step": 3112, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:21.825035", "step": 3112, "epoch": 3 }, { "type": "loss", "content": 0.13390615582466125, "timestamp": "2025-09-02 14:19:21.827600", "step": 3113, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:21.888176", "step": 3113, "epoch": 3 }, { "type": "loss", "content": 0.21080562472343445, "timestamp": "2025-09-02 14:19:21.890708", "step": 3114, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:21.949168", "step": 3114, "epoch": 3 }, { "type": "loss", "content": 0.18890239298343658, "timestamp": "2025-09-02 14:19:21.951959", "step": 3115, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:22.007190", "step": 3115, "epoch": 3 }, { "type": "loss", "content": 0.08360166102647781, "timestamp": "2025-09-02 14:19:22.013282", "step": 3116, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:22.065675", "step": 3116, "epoch": 3 }, { "type": "loss", "content": 0.1758156269788742, "timestamp": "2025-09-02 14:19:22.068062", "step": 3117, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:22.120657", "step": 3117, "epoch": 3 }, { "type": "loss", "content": 0.22892482578754425, "timestamp": "2025-09-02 14:19:22.123087", "step": 3118, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:22.176084", "step": 3118, "epoch": 3 }, { "type": "loss", "content": 0.1186913400888443, "timestamp": "2025-09-02 14:19:22.178194", "step": 3119, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:22.231792", "step": 3119, "epoch": 3 }, { "type": "loss", "content": 0.1417291909456253, "timestamp": "2025-09-02 14:19:22.237742", "step": 3120, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:19:24.179997", "step": 3120, "epoch": 3 }, { "type": "pplx", "content": 8158.926056003251, "timestamp": "2025-09-02 14:19:24.182397", "step": 3120, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3120", "timestamp": "2025-09-02 14:19:24.548706", "step": 3120, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:24.606702", "step": 3120, "epoch": 3 }, { "type": "loss", "content": 0.18566690385341644, "timestamp": "2025-09-02 14:19:24.609610", "step": 3121, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:24.665095", "step": 3121, "epoch": 3 }, { "type": "loss", "content": 0.19595330953598022, "timestamp": "2025-09-02 14:19:24.667582", "step": 3122, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:24.721929", "step": 3122, "epoch": 3 }, { "type": "loss", "content": 0.09495020657777786, "timestamp": "2025-09-02 14:19:24.724465", "step": 3123, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:24.777927", "step": 3123, "epoch": 3 }, { "type": "loss", "content": 0.10100948065519333, "timestamp": "2025-09-02 14:19:24.784313", "step": 3124, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:24.838087", "step": 3124, "epoch": 3 }, { "type": "loss", "content": 0.157033309340477, "timestamp": "2025-09-02 14:19:24.840593", "step": 3125, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:24.894182", "step": 3125, "epoch": 3 }, { "type": "loss", "content": 0.23466850817203522, "timestamp": "2025-09-02 14:19:24.897040", "step": 3126, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:24.951155", "step": 3126, "epoch": 3 }, { "type": "loss", "content": 0.270531564950943, "timestamp": "2025-09-02 14:19:24.953731", "step": 3127, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:25.007348", "step": 3127, "epoch": 3 }, { "type": "loss", "content": 0.17110812664031982, "timestamp": "2025-09-02 14:19:25.013787", "step": 3128, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:25.072676", "step": 3128, "epoch": 3 }, { "type": "loss", "content": 0.10151813179254532, "timestamp": "2025-09-02 14:19:25.075335", "step": 3129, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:25.137527", "step": 3129, "epoch": 3 }, { "type": "loss", "content": 0.12601503729820251, "timestamp": "2025-09-02 14:19:25.140028", "step": 3130, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:25.196172", "step": 3130, "epoch": 3 }, { "type": "loss", "content": 0.1712518036365509, "timestamp": "2025-09-02 14:19:25.198737", "step": 3131, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:25.252825", "step": 3131, "epoch": 3 }, { "type": "loss", "content": 0.1949497014284134, "timestamp": "2025-09-02 14:19:25.259161", "step": 3132, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:25.311275", "step": 3132, "epoch": 3 }, { "type": "loss", "content": 0.20492038130760193, "timestamp": "2025-09-02 14:19:25.313576", "step": 3133, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:25.367734", "step": 3133, "epoch": 3 }, { "type": "loss", "content": 0.08369824290275574, "timestamp": "2025-09-02 14:19:25.370056", "step": 3134, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:25.423302", "step": 3134, "epoch": 3 }, { "type": "loss", "content": 0.07237767428159714, "timestamp": "2025-09-02 14:19:25.425979", "step": 3135, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:25.479111", "step": 3135, "epoch": 3 }, { "type": "loss", "content": 0.1573074907064438, "timestamp": "2025-09-02 14:19:25.485301", "step": 3136, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:25.539039", "step": 3136, "epoch": 3 }, { "type": "loss", "content": 0.2270972728729248, "timestamp": "2025-09-02 14:19:25.541939", "step": 3137, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:25.595073", "step": 3137, "epoch": 3 }, { "type": "loss", "content": 0.10667001456022263, "timestamp": "2025-09-02 14:19:25.597574", "step": 3138, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:25.650924", "step": 3138, "epoch": 3 }, { "type": "loss", "content": 0.19268891215324402, "timestamp": "2025-09-02 14:19:25.653416", "step": 3139, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:25.706501", "step": 3139, "epoch": 3 }, { "type": "loss", "content": 0.10779896378517151, "timestamp": "2025-09-02 14:19:25.713000", "step": 3140, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:19:27.589320", "step": 3140, "epoch": 3 }, { "type": "pplx", "content": 8225.809618959083, "timestamp": "2025-09-02 14:19:27.592074", "step": 3140, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:27.644648", "step": 3140, "epoch": 3 }, { "type": "loss", "content": 0.18423697352409363, "timestamp": "2025-09-02 14:19:27.648020", "step": 3141, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:27.702907", "step": 3141, "epoch": 3 }, { "type": "loss", "content": 0.21171481907367706, "timestamp": "2025-09-02 14:19:27.705449", "step": 3142, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:27.759195", "step": 3142, "epoch": 3 }, { "type": "loss", "content": 0.12708152830600739, "timestamp": "2025-09-02 14:19:27.761857", "step": 3143, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:27.815245", "step": 3143, "epoch": 3 }, { "type": "loss", "content": 0.16236574947834015, "timestamp": "2025-09-02 14:19:27.822080", "step": 3144, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:27.875350", "step": 3144, "epoch": 3 }, { "type": "loss", "content": 0.11348918825387955, "timestamp": "2025-09-02 14:19:27.878612", "step": 3145, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:27.932612", "step": 3145, "epoch": 3 }, { "type": "loss", "content": 0.15189844369888306, "timestamp": "2025-09-02 14:19:27.935265", "step": 3146, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:27.989325", "step": 3146, "epoch": 3 }, { "type": "loss", "content": 0.1743157058954239, "timestamp": "2025-09-02 14:19:27.991402", "step": 3147, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:28.044647", "step": 3147, "epoch": 3 }, { "type": "loss", "content": 0.1458682119846344, "timestamp": "2025-09-02 14:19:28.050626", "step": 3148, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:28.104390", "step": 3148, "epoch": 3 }, { "type": "loss", "content": 0.09034041315317154, "timestamp": "2025-09-02 14:19:28.107173", "step": 3149, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:28.160196", "step": 3149, "epoch": 3 }, { "type": "loss", "content": 0.11172758042812347, "timestamp": "2025-09-02 14:19:28.162871", "step": 3150, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:28.216399", "step": 3150, "epoch": 3 }, { "type": "loss", "content": 0.15963990986347198, "timestamp": "2025-09-02 14:19:28.219002", "step": 3151, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:28.272938", "step": 3151, "epoch": 3 }, { "type": "loss", "content": 0.15355300903320312, "timestamp": "2025-09-02 14:19:28.279061", "step": 3152, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:28.332785", "step": 3152, "epoch": 3 }, { "type": "loss", "content": 0.10186943411827087, "timestamp": "2025-09-02 14:19:28.335184", "step": 3153, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:28.387656", "step": 3153, "epoch": 3 }, { "type": "loss", "content": 0.15238569676876068, "timestamp": "2025-09-02 14:19:28.390169", "step": 3154, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:28.442814", "step": 3154, "epoch": 3 }, { "type": "loss", "content": 0.17233221232891083, "timestamp": "2025-09-02 14:19:28.445096", "step": 3155, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:28.498391", "step": 3155, "epoch": 3 }, { "type": "loss", "content": 0.13907170295715332, "timestamp": "2025-09-02 14:19:28.504529", "step": 3156, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:28.556532", "step": 3156, "epoch": 3 }, { "type": "loss", "content": 0.12545600533485413, "timestamp": "2025-09-02 14:19:28.559132", "step": 3157, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:28.613266", "step": 3157, "epoch": 3 }, { "type": "loss", "content": 0.07059308886528015, "timestamp": "2025-09-02 14:19:28.616370", "step": 3158, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:28.670416", "step": 3158, "epoch": 3 }, { "type": "loss", "content": 0.1018519327044487, "timestamp": "2025-09-02 14:19:28.672883", "step": 3159, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:28.726024", "step": 3159, "epoch": 3 }, { "type": "loss", "content": 0.1929663121700287, "timestamp": "2025-09-02 14:19:28.732230", "step": 3160, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:19:30.606931", "step": 3160, "epoch": 3 }, { "type": "pplx", "content": 8337.130304169254, "timestamp": "2025-09-02 14:19:30.609288", "step": 3160, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3160", "timestamp": "2025-09-02 14:19:30.974547", "step": 3160, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:31.030871", "step": 3160, "epoch": 3 }, { "type": "loss", "content": 0.29054710268974304, "timestamp": "2025-09-02 14:19:31.033362", "step": 3161, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:31.088758", "step": 3161, "epoch": 3 }, { "type": "loss", "content": 0.1484536975622177, "timestamp": "2025-09-02 14:19:31.091782", "step": 3162, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:31.147156", "step": 3162, "epoch": 3 }, { "type": "loss", "content": 0.2779659628868103, "timestamp": "2025-09-02 14:19:31.149876", "step": 3163, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:31.203426", "step": 3163, "epoch": 3 }, { "type": "loss", "content": 0.17027126252651215, "timestamp": "2025-09-02 14:19:31.209871", "step": 3164, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:31.263816", "step": 3164, "epoch": 3 }, { "type": "loss", "content": 0.15375156700611115, "timestamp": "2025-09-02 14:19:31.266738", "step": 3165, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:31.320477", "step": 3165, "epoch": 3 }, { "type": "loss", "content": 0.1795056164264679, "timestamp": "2025-09-02 14:19:31.323103", "step": 3166, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:31.377177", "step": 3166, "epoch": 3 }, { "type": "loss", "content": 0.05908656120300293, "timestamp": "2025-09-02 14:19:31.379888", "step": 3167, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:31.433287", "step": 3167, "epoch": 3 }, { "type": "loss", "content": 0.08502227813005447, "timestamp": "2025-09-02 14:19:31.439827", "step": 3168, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:31.492837", "step": 3168, "epoch": 3 }, { "type": "loss", "content": 0.25887489318847656, "timestamp": "2025-09-02 14:19:31.495273", "step": 3169, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:31.549555", "step": 3169, "epoch": 3 }, { "type": "loss", "content": 0.1342175453901291, "timestamp": "2025-09-02 14:19:31.552253", "step": 3170, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:31.605912", "step": 3170, "epoch": 3 }, { "type": "loss", "content": 0.24536915123462677, "timestamp": "2025-09-02 14:19:31.608158", "step": 3171, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:31.661354", "step": 3171, "epoch": 3 }, { "type": "loss", "content": 0.18191450834274292, "timestamp": "2025-09-02 14:19:31.667771", "step": 3172, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:31.720733", "step": 3172, "epoch": 3 }, { "type": "loss", "content": 0.1429639458656311, "timestamp": "2025-09-02 14:19:31.723361", "step": 3173, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:31.778049", "step": 3173, "epoch": 3 }, { "type": "loss", "content": 0.055122505873441696, "timestamp": "2025-09-02 14:19:31.780245", "step": 3174, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:31.833803", "step": 3174, "epoch": 3 }, { "type": "loss", "content": 0.1811804324388504, "timestamp": "2025-09-02 14:19:31.835834", "step": 3175, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:31.889254", "step": 3175, "epoch": 3 }, { "type": "loss", "content": 0.09854148328304291, "timestamp": "2025-09-02 14:19:31.895930", "step": 3176, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:31.948823", "step": 3176, "epoch": 3 }, { "type": "loss", "content": 0.22967839241027832, "timestamp": "2025-09-02 14:19:31.951221", "step": 3177, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:32.004674", "step": 3177, "epoch": 3 }, { "type": "loss", "content": 0.18386292457580566, "timestamp": "2025-09-02 14:19:32.006827", "step": 3178, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:32.060474", "step": 3178, "epoch": 3 }, { "type": "loss", "content": 0.10891995579004288, "timestamp": "2025-09-02 14:19:32.063181", "step": 3179, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:32.116717", "step": 3179, "epoch": 3 }, { "type": "loss", "content": 0.36039218306541443, "timestamp": "2025-09-02 14:19:32.122917", "step": 3180, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:19:34.013760", "step": 3180, "epoch": 3 }, { "type": "pplx", "content": 8498.770083969772, "timestamp": "2025-09-02 14:19:34.016197", "step": 3180, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:34.070436", "step": 3180, "epoch": 3 }, { "type": "loss", "content": 0.14172224700450897, "timestamp": "2025-09-02 14:19:34.073102", "step": 3181, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:34.130075", "step": 3181, "epoch": 3 }, { "type": "loss", "content": 0.16488482058048248, "timestamp": "2025-09-02 14:19:34.132428", "step": 3182, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:34.186325", "step": 3182, "epoch": 3 }, { "type": "loss", "content": 0.20163625478744507, "timestamp": "2025-09-02 14:19:34.188869", "step": 3183, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:34.247353", "step": 3183, "epoch": 3 }, { "type": "loss", "content": 0.17459453642368317, "timestamp": "2025-09-02 14:19:34.253820", "step": 3184, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:34.310023", "step": 3184, "epoch": 3 }, { "type": "loss", "content": 0.14379552006721497, "timestamp": "2025-09-02 14:19:34.312904", "step": 3185, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:34.367239", "step": 3185, "epoch": 3 }, { "type": "loss", "content": 0.10888668149709702, "timestamp": "2025-09-02 14:19:34.369618", "step": 3186, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:34.426528", "step": 3186, "epoch": 3 }, { "type": "loss", "content": 0.1405681073665619, "timestamp": "2025-09-02 14:19:34.428870", "step": 3187, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:34.489154", "step": 3187, "epoch": 3 }, { "type": "loss", "content": 0.1416449099779129, "timestamp": "2025-09-02 14:19:34.495629", "step": 3188, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:34.554134", "step": 3188, "epoch": 3 }, { "type": "loss", "content": 0.31350234150886536, "timestamp": "2025-09-02 14:19:34.556639", "step": 3189, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:34.610128", "step": 3189, "epoch": 3 }, { "type": "loss", "content": 0.17130708694458008, "timestamp": "2025-09-02 14:19:34.612720", "step": 3190, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:34.671254", "step": 3190, "epoch": 3 }, { "type": "loss", "content": 0.07601047307252884, "timestamp": "2025-09-02 14:19:34.673826", "step": 3191, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:34.731523", "step": 3191, "epoch": 3 }, { "type": "loss", "content": 0.09948358684778214, "timestamp": "2025-09-02 14:19:34.737732", "step": 3192, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:34.790880", "step": 3192, "epoch": 3 }, { "type": "loss", "content": 0.20729480683803558, "timestamp": "2025-09-02 14:19:34.793184", "step": 3193, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:34.848230", "step": 3193, "epoch": 3 }, { "type": "loss", "content": 0.160397469997406, "timestamp": "2025-09-02 14:19:34.850860", "step": 3194, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:34.904137", "step": 3194, "epoch": 3 }, { "type": "loss", "content": 0.3159931004047394, "timestamp": "2025-09-02 14:19:34.906593", "step": 3195, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:34.959928", "step": 3195, "epoch": 3 }, { "type": "loss", "content": 0.19495642185211182, "timestamp": "2025-09-02 14:19:34.965997", "step": 3196, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:35.019006", "step": 3196, "epoch": 3 }, { "type": "loss", "content": 0.13204851746559143, "timestamp": "2025-09-02 14:19:35.021366", "step": 3197, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:35.074827", "step": 3197, "epoch": 3 }, { "type": "loss", "content": 0.09460259974002838, "timestamp": "2025-09-02 14:19:35.077038", "step": 3198, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:35.129195", "step": 3198, "epoch": 3 }, { "type": "loss", "content": 0.09812267124652863, "timestamp": "2025-09-02 14:19:35.131713", "step": 3199, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:35.184558", "step": 3199, "epoch": 3 }, { "type": "loss", "content": 0.25956404209136963, "timestamp": "2025-09-02 14:19:35.190541", "step": 3200, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:19:37.063941", "step": 3200, "epoch": 3 }, { "type": "pplx", "content": 8420.785826326051, "timestamp": "2025-09-02 14:19:37.066314", "step": 3200, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3200", "timestamp": "2025-09-02 14:19:37.433984", "step": 3200, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:37.489415", "step": 3200, "epoch": 3 }, { "type": "loss", "content": 0.1266373097896576, "timestamp": "2025-09-02 14:19:37.491809", "step": 3201, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:37.544938", "step": 3201, "epoch": 3 }, { "type": "loss", "content": 0.15306566655635834, "timestamp": "2025-09-02 14:19:37.547334", "step": 3202, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:37.600534", "step": 3202, "epoch": 3 }, { "type": "loss", "content": 0.06533515453338623, "timestamp": "2025-09-02 14:19:37.603074", "step": 3203, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:37.656088", "step": 3203, "epoch": 3 }, { "type": "loss", "content": 0.10335353016853333, "timestamp": "2025-09-02 14:19:37.662399", "step": 3204, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:37.714633", "step": 3204, "epoch": 3 }, { "type": "loss", "content": 0.15642210841178894, "timestamp": "2025-09-02 14:19:37.717108", "step": 3205, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:37.769905", "step": 3205, "epoch": 3 }, { "type": "loss", "content": 0.27852940559387207, "timestamp": "2025-09-02 14:19:37.772308", "step": 3206, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:37.825416", "step": 3206, "epoch": 3 }, { "type": "loss", "content": 0.12584231793880463, "timestamp": "2025-09-02 14:19:37.827918", "step": 3207, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:37.880868", "step": 3207, "epoch": 3 }, { "type": "loss", "content": 0.10581807047128677, "timestamp": "2025-09-02 14:19:37.886965", "step": 3208, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:37.940700", "step": 3208, "epoch": 3 }, { "type": "loss", "content": 0.09655214846134186, "timestamp": "2025-09-02 14:19:37.943171", "step": 3209, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:37.996383", "step": 3209, "epoch": 3 }, { "type": "loss", "content": 0.14854256808757782, "timestamp": "2025-09-02 14:19:37.998862", "step": 3210, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:38.052549", "step": 3210, "epoch": 3 }, { "type": "loss", "content": 0.09265346080064774, "timestamp": "2025-09-02 14:19:38.055228", "step": 3211, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:38.111932", "step": 3211, "epoch": 3 }, { "type": "loss", "content": 0.11738929897546768, "timestamp": "2025-09-02 14:19:38.118327", "step": 3212, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:38.175914", "step": 3212, "epoch": 3 }, { "type": "loss", "content": 0.06532277166843414, "timestamp": "2025-09-02 14:19:38.178536", "step": 3213, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:38.236991", "step": 3213, "epoch": 3 }, { "type": "loss", "content": 0.16640973091125488, "timestamp": "2025-09-02 14:19:38.239407", "step": 3214, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:38.298495", "step": 3214, "epoch": 3 }, { "type": "loss", "content": 0.21504634618759155, "timestamp": "2025-09-02 14:19:38.300862", "step": 3215, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:38.353391", "step": 3215, "epoch": 3 }, { "type": "loss", "content": 0.21120429039001465, "timestamp": "2025-09-02 14:19:38.359291", "step": 3216, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:38.411686", "step": 3216, "epoch": 3 }, { "type": "loss", "content": 0.09051873534917831, "timestamp": "2025-09-02 14:19:38.414091", "step": 3217, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:38.479807", "step": 3217, "epoch": 3 }, { "type": "loss", "content": 0.049110449850559235, "timestamp": "2025-09-02 14:19:38.482361", "step": 3218, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:38.538407", "step": 3218, "epoch": 3 }, { "type": "loss", "content": 0.1135195717215538, "timestamp": "2025-09-02 14:19:38.540869", "step": 3219, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:38.594831", "step": 3219, "epoch": 3 }, { "type": "loss", "content": 0.20036457479000092, "timestamp": "2025-09-02 14:19:38.600947", "step": 3220, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:19:40.484495", "step": 3220, "epoch": 3 }, { "type": "pplx", "content": 8291.705073897572, "timestamp": "2025-09-02 14:19:40.486777", "step": 3220, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:40.540983", "step": 3220, "epoch": 3 }, { "type": "loss", "content": 0.14278550446033478, "timestamp": "2025-09-02 14:19:40.543728", "step": 3221, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:40.599864", "step": 3221, "epoch": 3 }, { "type": "loss", "content": 0.23079626262187958, "timestamp": "2025-09-02 14:19:40.602324", "step": 3222, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:40.657762", "step": 3222, "epoch": 3 }, { "type": "loss", "content": 0.20747289061546326, "timestamp": "2025-09-02 14:19:40.660218", "step": 3223, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:40.713070", "step": 3223, "epoch": 3 }, { "type": "loss", "content": 0.11061892658472061, "timestamp": "2025-09-02 14:19:40.719573", "step": 3224, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:40.772348", "step": 3224, "epoch": 3 }, { "type": "loss", "content": 0.10979148745536804, "timestamp": "2025-09-02 14:19:40.774512", "step": 3225, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:40.828177", "step": 3225, "epoch": 3 }, { "type": "loss", "content": 0.28449729084968567, "timestamp": "2025-09-02 14:19:40.830507", "step": 3226, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:40.884877", "step": 3226, "epoch": 3 }, { "type": "loss", "content": 0.10597681999206543, "timestamp": "2025-09-02 14:19:40.887384", "step": 3227, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:40.940785", "step": 3227, "epoch": 3 }, { "type": "loss", "content": 0.031918495893478394, "timestamp": "2025-09-02 14:19:40.947285", "step": 3228, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:41.001329", "step": 3228, "epoch": 3 }, { "type": "loss", "content": 0.30362668633461, "timestamp": "2025-09-02 14:19:41.003954", "step": 3229, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:41.057448", "step": 3229, "epoch": 3 }, { "type": "loss", "content": 0.16545793414115906, "timestamp": "2025-09-02 14:19:41.059899", "step": 3230, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:41.113176", "step": 3230, "epoch": 3 }, { "type": "loss", "content": 0.06938117742538452, "timestamp": "2025-09-02 14:19:41.115614", "step": 3231, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:41.171835", "step": 3231, "epoch": 3 }, { "type": "loss", "content": 0.12517040967941284, "timestamp": "2025-09-02 14:19:41.178163", "step": 3232, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:41.231280", "step": 3232, "epoch": 3 }, { "type": "loss", "content": 0.1147971823811531, "timestamp": "2025-09-02 14:19:41.233619", "step": 3233, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:41.287206", "step": 3233, "epoch": 3 }, { "type": "loss", "content": 0.16310517489910126, "timestamp": "2025-09-02 14:19:41.290380", "step": 3234, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:41.344487", "step": 3234, "epoch": 3 }, { "type": "loss", "content": 0.13684231042861938, "timestamp": "2025-09-02 14:19:41.348166", "step": 3235, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:41.401599", "step": 3235, "epoch": 3 }, { "type": "loss", "content": 0.12773951888084412, "timestamp": "2025-09-02 14:19:41.408360", "step": 3236, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:41.460972", "step": 3236, "epoch": 3 }, { "type": "loss", "content": 0.12409916520118713, "timestamp": "2025-09-02 14:19:41.463702", "step": 3237, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:41.517159", "step": 3237, "epoch": 3 }, { "type": "loss", "content": 0.27354609966278076, "timestamp": "2025-09-02 14:19:41.519761", "step": 3238, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:41.572956", "step": 3238, "epoch": 3 }, { "type": "loss", "content": 0.08420220017433167, "timestamp": "2025-09-02 14:19:41.575654", "step": 3239, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:41.630075", "step": 3239, "epoch": 3 }, { "type": "loss", "content": 0.12331055104732513, "timestamp": "2025-09-02 14:19:41.636256", "step": 3240, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:19:43.537201", "step": 3240, "epoch": 3 }, { "type": "pplx", "content": 8388.601695176398, "timestamp": "2025-09-02 14:19:43.540040", "step": 3240, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3240", "timestamp": "2025-09-02 14:19:44.110833", "step": 3240, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:44.169997", "step": 3240, "epoch": 3 }, { "type": "loss", "content": 0.08439157158136368, "timestamp": "2025-09-02 14:19:44.175706", "step": 3241, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:44.233046", "step": 3241, "epoch": 3 }, { "type": "loss", "content": 0.16235171258449554, "timestamp": "2025-09-02 14:19:44.235724", "step": 3242, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:44.290973", "step": 3242, "epoch": 3 }, { "type": "loss", "content": 0.17243751883506775, "timestamp": "2025-09-02 14:19:44.294335", "step": 3243, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:44.349786", "step": 3243, "epoch": 3 }, { "type": "loss", "content": 0.11323410272598267, "timestamp": "2025-09-02 14:19:44.356604", "step": 3244, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:44.410471", "step": 3244, "epoch": 3 }, { "type": "loss", "content": 0.1172560304403305, "timestamp": "2025-09-02 14:19:44.413179", "step": 3245, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:44.467610", "step": 3245, "epoch": 3 }, { "type": "loss", "content": 0.15582002699375153, "timestamp": "2025-09-02 14:19:44.469986", "step": 3246, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:44.525599", "step": 3246, "epoch": 3 }, { "type": "loss", "content": 0.1638721376657486, "timestamp": "2025-09-02 14:19:44.528042", "step": 3247, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:44.582059", "step": 3247, "epoch": 3 }, { "type": "loss", "content": 0.10222571343183517, "timestamp": "2025-09-02 14:19:44.588581", "step": 3248, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:44.642106", "step": 3248, "epoch": 3 }, { "type": "loss", "content": 0.1503859907388687, "timestamp": "2025-09-02 14:19:44.644789", "step": 3249, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:44.698957", "step": 3249, "epoch": 3 }, { "type": "loss", "content": 0.1878579556941986, "timestamp": "2025-09-02 14:19:44.701421", "step": 3250, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:44.756205", "step": 3250, "epoch": 3 }, { "type": "loss", "content": 0.13819772005081177, "timestamp": "2025-09-02 14:19:44.759374", "step": 3251, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:44.814469", "step": 3251, "epoch": 3 }, { "type": "loss", "content": 0.2367677390575409, "timestamp": "2025-09-02 14:19:44.821194", "step": 3252, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:44.874611", "step": 3252, "epoch": 3 }, { "type": "loss", "content": 0.10657861828804016, "timestamp": "2025-09-02 14:19:44.877265", "step": 3253, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:44.931296", "step": 3253, "epoch": 3 }, { "type": "loss", "content": 0.11462022364139557, "timestamp": "2025-09-02 14:19:44.933826", "step": 3254, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:44.988301", "step": 3254, "epoch": 3 }, { "type": "loss", "content": 0.32427921891212463, "timestamp": "2025-09-02 14:19:44.990563", "step": 3255, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:45.045523", "step": 3255, "epoch": 3 }, { "type": "loss", "content": 0.1611650586128235, "timestamp": "2025-09-02 14:19:45.052179", "step": 3256, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:45.105688", "step": 3256, "epoch": 3 }, { "type": "loss", "content": 0.12155336141586304, "timestamp": "2025-09-02 14:19:45.108058", "step": 3257, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:45.163092", "step": 3257, "epoch": 3 }, { "type": "loss", "content": 0.21587704122066498, "timestamp": "2025-09-02 14:19:45.165630", "step": 3258, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:45.219758", "step": 3258, "epoch": 3 }, { "type": "loss", "content": 0.11643549054861069, "timestamp": "2025-09-02 14:19:45.222326", "step": 3259, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:45.276618", "step": 3259, "epoch": 3 }, { "type": "loss", "content": 0.15780328214168549, "timestamp": "2025-09-02 14:19:45.282895", "step": 3260, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:19:47.198027", "step": 3260, "epoch": 3 }, { "type": "pplx", "content": 8410.546233848116, "timestamp": "2025-09-02 14:19:47.201085", "step": 3260, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:47.256269", "step": 3260, "epoch": 3 }, { "type": "loss", "content": 0.18527792394161224, "timestamp": "2025-09-02 14:19:47.258704", "step": 3261, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:47.313827", "step": 3261, "epoch": 3 }, { "type": "loss", "content": 0.22626258432865143, "timestamp": "2025-09-02 14:19:47.317741", "step": 3262, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:47.372878", "step": 3262, "epoch": 3 }, { "type": "loss", "content": 0.1333838552236557, "timestamp": "2025-09-02 14:19:47.375315", "step": 3263, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:47.429659", "step": 3263, "epoch": 3 }, { "type": "loss", "content": 0.24386608600616455, "timestamp": "2025-09-02 14:19:47.436302", "step": 3264, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:47.490400", "step": 3264, "epoch": 3 }, { "type": "loss", "content": 0.13088180124759674, "timestamp": "2025-09-02 14:19:47.492696", "step": 3265, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:47.550293", "step": 3265, "epoch": 3 }, { "type": "loss", "content": 0.14386086165905, "timestamp": "2025-09-02 14:19:47.552829", "step": 3266, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:47.607600", "step": 3266, "epoch": 3 }, { "type": "loss", "content": 0.11114614456892014, "timestamp": "2025-09-02 14:19:47.609985", "step": 3267, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:47.664110", "step": 3267, "epoch": 3 }, { "type": "loss", "content": 0.15304824709892273, "timestamp": "2025-09-02 14:19:47.670690", "step": 3268, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:47.724405", "step": 3268, "epoch": 3 }, { "type": "loss", "content": 0.1923639476299286, "timestamp": "2025-09-02 14:19:47.728166", "step": 3269, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:47.782033", "step": 3269, "epoch": 3 }, { "type": "loss", "content": 0.1522635966539383, "timestamp": "2025-09-02 14:19:47.784443", "step": 3270, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:47.838784", "step": 3270, "epoch": 3 }, { "type": "loss", "content": 0.2529730498790741, "timestamp": "2025-09-02 14:19:47.841177", "step": 3271, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:47.895235", "step": 3271, "epoch": 3 }, { "type": "loss", "content": 0.20375974476337433, "timestamp": "2025-09-02 14:19:47.901574", "step": 3272, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:47.955038", "step": 3272, "epoch": 3 }, { "type": "loss", "content": 0.16086618602275848, "timestamp": "2025-09-02 14:19:47.957402", "step": 3273, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:48.011241", "step": 3273, "epoch": 3 }, { "type": "loss", "content": 0.21704156696796417, "timestamp": "2025-09-02 14:19:48.013706", "step": 3274, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:48.067895", "step": 3274, "epoch": 3 }, { "type": "loss", "content": 0.13708806037902832, "timestamp": "2025-09-02 14:19:48.070329", "step": 3275, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:48.124703", "step": 3275, "epoch": 3 }, { "type": "loss", "content": 0.10301544517278671, "timestamp": "2025-09-02 14:19:48.131023", "step": 3276, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:48.185646", "step": 3276, "epoch": 3 }, { "type": "loss", "content": 0.25300145149230957, "timestamp": "2025-09-02 14:19:48.188044", "step": 3277, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:48.242432", "step": 3277, "epoch": 3 }, { "type": "loss", "content": 0.17388705909252167, "timestamp": "2025-09-02 14:19:48.244881", "step": 3278, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:48.299150", "step": 3278, "epoch": 3 }, { "type": "loss", "content": 0.09938892722129822, "timestamp": "2025-09-02 14:19:48.301856", "step": 3279, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:48.356464", "step": 3279, "epoch": 3 }, { "type": "loss", "content": 0.1299990564584732, "timestamp": "2025-09-02 14:19:48.365121", "step": 3280, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:19:50.265133", "step": 3280, "epoch": 3 }, { "type": "pplx", "content": 8328.34815482715, "timestamp": "2025-09-02 14:19:50.267456", "step": 3280, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3280", "timestamp": "2025-09-02 14:19:50.629254", "step": 3280, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:50.686574", "step": 3280, "epoch": 3 }, { "type": "loss", "content": 0.12654010951519012, "timestamp": "2025-09-02 14:19:50.688971", "step": 3281, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:50.744366", "step": 3281, "epoch": 3 }, { "type": "loss", "content": 0.15190663933753967, "timestamp": "2025-09-02 14:19:50.746879", "step": 3282, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:50.801894", "step": 3282, "epoch": 3 }, { "type": "loss", "content": 0.35592013597488403, "timestamp": "2025-09-02 14:19:50.804187", "step": 3283, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:50.858348", "step": 3283, "epoch": 3 }, { "type": "loss", "content": 0.041567184031009674, "timestamp": "2025-09-02 14:19:50.864641", "step": 3284, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:50.919086", "step": 3284, "epoch": 3 }, { "type": "loss", "content": 0.07620271295309067, "timestamp": "2025-09-02 14:19:50.921474", "step": 3285, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:50.975808", "step": 3285, "epoch": 3 }, { "type": "loss", "content": 0.1589616984128952, "timestamp": "2025-09-02 14:19:50.978296", "step": 3286, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:51.032382", "step": 3286, "epoch": 3 }, { "type": "loss", "content": 0.16588151454925537, "timestamp": "2025-09-02 14:19:51.034877", "step": 3287, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:51.089368", "step": 3287, "epoch": 3 }, { "type": "loss", "content": 0.21832814812660217, "timestamp": "2025-09-02 14:19:51.095830", "step": 3288, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:51.151584", "step": 3288, "epoch": 3 }, { "type": "loss", "content": 0.12054231762886047, "timestamp": "2025-09-02 14:19:51.153734", "step": 3289, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:51.213316", "step": 3289, "epoch": 3 }, { "type": "loss", "content": 0.13232584297657013, "timestamp": "2025-09-02 14:19:51.215683", "step": 3290, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:51.276962", "step": 3290, "epoch": 3 }, { "type": "loss", "content": 0.15174049139022827, "timestamp": "2025-09-02 14:19:51.279193", "step": 3291, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:51.341713", "step": 3291, "epoch": 3 }, { "type": "loss", "content": 0.0844954252243042, "timestamp": "2025-09-02 14:19:51.348150", "step": 3292, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:51.409163", "step": 3292, "epoch": 3 }, { "type": "loss", "content": 0.22859284281730652, "timestamp": "2025-09-02 14:19:51.411535", "step": 3293, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:51.471038", "step": 3293, "epoch": 3 }, { "type": "loss", "content": 0.17700466513633728, "timestamp": "2025-09-02 14:19:51.474796", "step": 3294, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:51.529095", "step": 3294, "epoch": 3 }, { "type": "loss", "content": 0.29292190074920654, "timestamp": "2025-09-02 14:19:51.531464", "step": 3295, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:51.584839", "step": 3295, "epoch": 3 }, { "type": "loss", "content": 0.2286795824766159, "timestamp": "2025-09-02 14:19:51.590986", "step": 3296, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:51.643807", "step": 3296, "epoch": 3 }, { "type": "loss", "content": 0.1805548369884491, "timestamp": "2025-09-02 14:19:51.646379", "step": 3297, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:51.700581", "step": 3297, "epoch": 3 }, { "type": "loss", "content": 0.1694185733795166, "timestamp": "2025-09-02 14:19:51.703204", "step": 3298, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:51.757079", "step": 3298, "epoch": 3 }, { "type": "loss", "content": 0.17668263614177704, "timestamp": "2025-09-02 14:19:51.759499", "step": 3299, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:51.813703", "step": 3299, "epoch": 3 }, { "type": "loss", "content": 0.18067380785942078, "timestamp": "2025-09-02 14:19:51.820212", "step": 3300, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:19:53.758265", "step": 3300, "epoch": 3 }, { "type": "pplx", "content": 8354.333867762356, "timestamp": "2025-09-02 14:19:53.760812", "step": 3300, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:53.814964", "step": 3300, "epoch": 3 }, { "type": "loss", "content": 0.12372546643018723, "timestamp": "2025-09-02 14:19:53.817400", "step": 3301, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:53.872078", "step": 3301, "epoch": 3 }, { "type": "loss", "content": 0.07153906673192978, "timestamp": "2025-09-02 14:19:53.874465", "step": 3302, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:53.928074", "step": 3302, "epoch": 3 }, { "type": "loss", "content": 0.1743365228176117, "timestamp": "2025-09-02 14:19:53.930418", "step": 3303, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:53.984186", "step": 3303, "epoch": 3 }, { "type": "loss", "content": 0.11328288912773132, "timestamp": "2025-09-02 14:19:53.990481", "step": 3304, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:54.044099", "step": 3304, "epoch": 3 }, { "type": "loss", "content": 0.08902394771575928, "timestamp": "2025-09-02 14:19:54.046408", "step": 3305, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:54.100208", "step": 3305, "epoch": 3 }, { "type": "loss", "content": 0.07835470139980316, "timestamp": "2025-09-02 14:19:54.102523", "step": 3306, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:54.156519", "step": 3306, "epoch": 3 }, { "type": "loss", "content": 0.24669593572616577, "timestamp": "2025-09-02 14:19:54.158874", "step": 3307, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:54.212554", "step": 3307, "epoch": 3 }, { "type": "loss", "content": 0.18969368934631348, "timestamp": "2025-09-02 14:19:54.218813", "step": 3308, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:54.271685", "step": 3308, "epoch": 3 }, { "type": "loss", "content": 0.09857875853776932, "timestamp": "2025-09-02 14:19:54.274107", "step": 3309, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:54.327607", "step": 3309, "epoch": 3 }, { "type": "loss", "content": 0.2065911740064621, "timestamp": "2025-09-02 14:19:54.329869", "step": 3310, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:54.382962", "step": 3310, "epoch": 3 }, { "type": "loss", "content": 0.10104765743017197, "timestamp": "2025-09-02 14:19:54.385386", "step": 3311, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:54.441102", "step": 3311, "epoch": 3 }, { "type": "loss", "content": 0.10499391704797745, "timestamp": "2025-09-02 14:19:54.447400", "step": 3312, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:54.501212", "step": 3312, "epoch": 3 }, { "type": "loss", "content": 0.11598062515258789, "timestamp": "2025-09-02 14:19:54.503915", "step": 3313, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:54.558188", "step": 3313, "epoch": 3 }, { "type": "loss", "content": 0.09705338627099991, "timestamp": "2025-09-02 14:19:54.560801", "step": 3314, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:54.614535", "step": 3314, "epoch": 3 }, { "type": "loss", "content": 0.23528894782066345, "timestamp": "2025-09-02 14:19:54.616961", "step": 3315, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:54.670152", "step": 3315, "epoch": 3 }, { "type": "loss", "content": 0.15852832794189453, "timestamp": "2025-09-02 14:19:54.677365", "step": 3316, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:54.730901", "step": 3316, "epoch": 3 }, { "type": "loss", "content": 0.1207372397184372, "timestamp": "2025-09-02 14:19:54.733337", "step": 3317, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:54.787474", "step": 3317, "epoch": 3 }, { "type": "loss", "content": 0.11410140246152878, "timestamp": "2025-09-02 14:19:54.789749", "step": 3318, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:54.843991", "step": 3318, "epoch": 3 }, { "type": "loss", "content": 0.20714613795280457, "timestamp": "2025-09-02 14:19:54.846322", "step": 3319, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:54.899640", "step": 3319, "epoch": 3 }, { "type": "loss", "content": 0.09052963554859161, "timestamp": "2025-09-02 14:19:54.905730", "step": 3320, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:19:56.785657", "step": 3320, "epoch": 3 }, { "type": "pplx", "content": 8692.206061184721, "timestamp": "2025-09-02 14:19:56.787917", "step": 3320, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3320", "timestamp": "2025-09-02 14:19:57.209077", "step": 3320, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:57.268996", "step": 3320, "epoch": 3 }, { "type": "loss", "content": 0.17174604535102844, "timestamp": "2025-09-02 14:19:57.273536", "step": 3321, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:57.331565", "step": 3321, "epoch": 3 }, { "type": "loss", "content": 0.09998596459627151, "timestamp": "2025-09-02 14:19:57.333634", "step": 3322, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:57.387998", "step": 3322, "epoch": 3 }, { "type": "loss", "content": 0.18134258687496185, "timestamp": "2025-09-02 14:19:57.390195", "step": 3323, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:57.444508", "step": 3323, "epoch": 3 }, { "type": "loss", "content": 0.11484185606241226, "timestamp": "2025-09-02 14:19:57.460633", "step": 3324, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:57.520734", "step": 3324, "epoch": 3 }, { "type": "loss", "content": 0.04317670315504074, "timestamp": "2025-09-02 14:19:57.523148", "step": 3325, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:57.577041", "step": 3325, "epoch": 3 }, { "type": "loss", "content": 0.2172226458787918, "timestamp": "2025-09-02 14:19:57.582673", "step": 3326, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:57.642957", "step": 3326, "epoch": 3 }, { "type": "loss", "content": 0.10887590050697327, "timestamp": "2025-09-02 14:19:57.646733", "step": 3327, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:57.700916", "step": 3327, "epoch": 3 }, { "type": "loss", "content": 0.08702421188354492, "timestamp": "2025-09-02 14:19:57.713169", "step": 3328, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:57.767993", "step": 3328, "epoch": 3 }, { "type": "loss", "content": 0.08182681351900101, "timestamp": "2025-09-02 14:19:57.770396", "step": 3329, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:57.823992", "step": 3329, "epoch": 3 }, { "type": "loss", "content": 0.13431459665298462, "timestamp": "2025-09-02 14:19:57.826700", "step": 3330, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:57.880580", "step": 3330, "epoch": 3 }, { "type": "loss", "content": 0.11241181939840317, "timestamp": "2025-09-02 14:19:57.884820", "step": 3331, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:57.942969", "step": 3331, "epoch": 3 }, { "type": "loss", "content": 0.22024866938591003, "timestamp": "2025-09-02 14:19:57.949255", "step": 3332, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:19:58.002960", "step": 3332, "epoch": 3 }, { "type": "loss", "content": 0.19666379690170288, "timestamp": "2025-09-02 14:19:58.006739", "step": 3333, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:58.060158", "step": 3333, "epoch": 3 }, { "type": "loss", "content": 0.10926518589258194, "timestamp": "2025-09-02 14:19:58.062795", "step": 3334, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:58.124531", "step": 3334, "epoch": 3 }, { "type": "loss", "content": 0.1212100014090538, "timestamp": "2025-09-02 14:19:58.126708", "step": 3335, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:58.179820", "step": 3335, "epoch": 3 }, { "type": "loss", "content": 0.1501777619123459, "timestamp": "2025-09-02 14:19:58.186208", "step": 3336, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:58.239375", "step": 3336, "epoch": 3 }, { "type": "loss", "content": 0.15973007678985596, "timestamp": "2025-09-02 14:19:58.242915", "step": 3337, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:58.297295", "step": 3337, "epoch": 3 }, { "type": "loss", "content": 0.16030409932136536, "timestamp": "2025-09-02 14:19:58.303380", "step": 3338, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:19:58.367876", "step": 3338, "epoch": 3 }, { "type": "loss", "content": 0.2189367413520813, "timestamp": "2025-09-02 14:19:58.377293", "step": 3339, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:19:58.436472", "step": 3339, "epoch": 3 }, { "type": "loss", "content": 0.13406062126159668, "timestamp": "2025-09-02 14:19:58.442126", "step": 3340, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:20:00.391298", "step": 3340, "epoch": 3 }, { "type": "pplx", "content": 9204.449043534027, "timestamp": "2025-09-02 14:20:00.393628", "step": 3340, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:00.447706", "step": 3340, "epoch": 3 }, { "type": "loss", "content": 0.07989269495010376, "timestamp": "2025-09-02 14:20:00.449848", "step": 3341, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:00.503492", "step": 3341, "epoch": 3 }, { "type": "loss", "content": 0.0982702448964119, "timestamp": "2025-09-02 14:20:00.505946", "step": 3342, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:00.559446", "step": 3342, "epoch": 3 }, { "type": "loss", "content": 0.16605442762374878, "timestamp": "2025-09-02 14:20:00.561956", "step": 3343, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:00.615091", "step": 3343, "epoch": 3 }, { "type": "loss", "content": 0.12038030475378036, "timestamp": "2025-09-02 14:20:00.621638", "step": 3344, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:00.675026", "step": 3344, "epoch": 3 }, { "type": "loss", "content": 0.14128518104553223, "timestamp": "2025-09-02 14:20:00.677233", "step": 3345, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:00.730872", "step": 3345, "epoch": 3 }, { "type": "loss", "content": 0.1229315996170044, "timestamp": "2025-09-02 14:20:00.733296", "step": 3346, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:00.787372", "step": 3346, "epoch": 3 }, { "type": "loss", "content": 0.13939014077186584, "timestamp": "2025-09-02 14:20:00.789600", "step": 3347, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:00.845095", "step": 3347, "epoch": 3 }, { "type": "loss", "content": 0.25082162022590637, "timestamp": "2025-09-02 14:20:00.850914", "step": 3348, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:00.903274", "step": 3348, "epoch": 3 }, { "type": "loss", "content": 0.0936397910118103, "timestamp": "2025-09-02 14:20:00.905490", "step": 3349, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:00.958326", "step": 3349, "epoch": 3 }, { "type": "loss", "content": 0.08632966876029968, "timestamp": "2025-09-02 14:20:00.961841", "step": 3350, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:01.017470", "step": 3350, "epoch": 3 }, { "type": "loss", "content": 0.1710757464170456, "timestamp": "2025-09-02 14:20:01.019495", "step": 3351, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:01.074114", "step": 3351, "epoch": 3 }, { "type": "loss", "content": 0.1076897382736206, "timestamp": "2025-09-02 14:20:01.089561", "step": 3352, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:01.144739", "step": 3352, "epoch": 3 }, { "type": "loss", "content": 0.23080992698669434, "timestamp": "2025-09-02 14:20:01.147094", "step": 3353, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:01.202098", "step": 3353, "epoch": 3 }, { "type": "loss", "content": 0.11879824101924896, "timestamp": "2025-09-02 14:20:01.204525", "step": 3354, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:01.259840", "step": 3354, "epoch": 3 }, { "type": "loss", "content": 0.12061131745576859, "timestamp": "2025-09-02 14:20:01.262315", "step": 3355, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:01.316251", "step": 3355, "epoch": 3 }, { "type": "loss", "content": 0.11093077063560486, "timestamp": "2025-09-02 14:20:01.322725", "step": 3356, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:01.377763", "step": 3356, "epoch": 3 }, { "type": "loss", "content": 0.19253645837306976, "timestamp": "2025-09-02 14:20:01.380014", "step": 3357, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:01.439078", "step": 3357, "epoch": 3 }, { "type": "loss", "content": 0.25513455271720886, "timestamp": "2025-09-02 14:20:01.441640", "step": 3358, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:01.496081", "step": 3358, "epoch": 3 }, { "type": "loss", "content": 0.09542723000049591, "timestamp": "2025-09-02 14:20:01.498538", "step": 3359, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:01.566912", "step": 3359, "epoch": 3 }, { "type": "loss", "content": 0.09513016790151596, "timestamp": "2025-09-02 14:20:01.572699", "step": 3360, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:20:04.180711", "step": 3360, "epoch": 3 }, { "type": "pplx", "content": 8958.695001573891, "timestamp": "2025-09-02 14:20:04.182700", "step": 3360, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3360", "timestamp": "2025-09-02 14:20:04.568187", "step": 3360, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:04.632099", "step": 3360, "epoch": 3 }, { "type": "loss", "content": 0.12327703088521957, "timestamp": "2025-09-02 14:20:04.634644", "step": 3361, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:04.746376", "step": 3361, "epoch": 3 }, { "type": "loss", "content": 0.18400663137435913, "timestamp": "2025-09-02 14:20:04.749063", "step": 3362, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:04.826470", "step": 3362, "epoch": 3 }, { "type": "loss", "content": 0.24453476071357727, "timestamp": "2025-09-02 14:20:04.829134", "step": 3363, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:04.907431", "step": 3363, "epoch": 3 }, { "type": "loss", "content": 0.10556276887655258, "timestamp": "2025-09-02 14:20:04.914291", "step": 3364, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:04.993001", "step": 3364, "epoch": 3 }, { "type": "loss", "content": 0.1250561773777008, "timestamp": "2025-09-02 14:20:05.001039", "step": 3365, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:05.063905", "step": 3365, "epoch": 3 }, { "type": "loss", "content": 0.07182735204696655, "timestamp": "2025-09-02 14:20:05.066308", "step": 3366, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:05.124269", "step": 3366, "epoch": 3 }, { "type": "loss", "content": 0.12873759865760803, "timestamp": "2025-09-02 14:20:05.130062", "step": 3367, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:05.191598", "step": 3367, "epoch": 3 }, { "type": "loss", "content": 0.25273483991622925, "timestamp": "2025-09-02 14:20:05.198517", "step": 3368, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:05.255555", "step": 3368, "epoch": 3 }, { "type": "loss", "content": 0.1890585571527481, "timestamp": "2025-09-02 14:20:05.258627", "step": 3369, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:05.312717", "step": 3369, "epoch": 3 }, { "type": "loss", "content": 0.21333521604537964, "timestamp": "2025-09-02 14:20:05.315644", "step": 3370, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:05.380358", "step": 3370, "epoch": 3 }, { "type": "loss", "content": 0.10951016843318939, "timestamp": "2025-09-02 14:20:05.382824", "step": 3371, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:05.437717", "step": 3371, "epoch": 3 }, { "type": "loss", "content": 0.11308567970991135, "timestamp": "2025-09-02 14:20:05.443887", "step": 3372, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:05.500205", "step": 3372, "epoch": 3 }, { "type": "loss", "content": 0.1417081654071808, "timestamp": "2025-09-02 14:20:05.502539", "step": 3373, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:05.556900", "step": 3373, "epoch": 3 }, { "type": "loss", "content": 0.15430279076099396, "timestamp": "2025-09-02 14:20:05.558858", "step": 3374, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:05.612324", "step": 3374, "epoch": 3 }, { "type": "loss", "content": 0.19546008110046387, "timestamp": "2025-09-02 14:20:05.614578", "step": 3375, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:05.672573", "step": 3375, "epoch": 3 }, { "type": "loss", "content": 0.161224365234375, "timestamp": "2025-09-02 14:20:05.678759", "step": 3376, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:05.732307", "step": 3376, "epoch": 3 }, { "type": "loss", "content": 0.14055120944976807, "timestamp": "2025-09-02 14:20:05.734629", "step": 3377, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:05.788467", "step": 3377, "epoch": 3 }, { "type": "loss", "content": 0.20917847752571106, "timestamp": "2025-09-02 14:20:05.790821", "step": 3378, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:05.851714", "step": 3378, "epoch": 3 }, { "type": "loss", "content": 0.17224110662937164, "timestamp": "2025-09-02 14:20:05.853535", "step": 3379, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:05.907848", "step": 3379, "epoch": 3 }, { "type": "loss", "content": 0.19265936315059662, "timestamp": "2025-09-02 14:20:05.916518", "step": 3380, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:20:07.848404", "step": 3380, "epoch": 3 }, { "type": "pplx", "content": 9004.986125395255, "timestamp": "2025-09-02 14:20:07.852083", "step": 3380, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:07.907940", "step": 3380, "epoch": 3 }, { "type": "loss", "content": 0.17580702900886536, "timestamp": "2025-09-02 14:20:07.913023", "step": 3381, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:07.970032", "step": 3381, "epoch": 3 }, { "type": "loss", "content": 0.16924065351486206, "timestamp": "2025-09-02 14:20:07.976534", "step": 3382, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:08.033646", "step": 3382, "epoch": 3 }, { "type": "loss", "content": 0.18834248185157776, "timestamp": "2025-09-02 14:20:08.043978", "step": 3383, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:08.108841", "step": 3383, "epoch": 3 }, { "type": "loss", "content": 0.07624297589063644, "timestamp": "2025-09-02 14:20:08.125658", "step": 3384, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:08.186487", "step": 3384, "epoch": 3 }, { "type": "loss", "content": 0.05495001748204231, "timestamp": "2025-09-02 14:20:08.192307", "step": 3385, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:08.248935", "step": 3385, "epoch": 3 }, { "type": "loss", "content": 0.10615405440330505, "timestamp": "2025-09-02 14:20:08.264221", "step": 3386, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:08.331275", "step": 3386, "epoch": 3 }, { "type": "loss", "content": 0.14496520161628723, "timestamp": "2025-09-02 14:20:08.336919", "step": 3387, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:08.399044", "step": 3387, "epoch": 3 }, { "type": "loss", "content": 0.1041417345404625, "timestamp": "2025-09-02 14:20:08.409651", "step": 3388, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:08.469613", "step": 3388, "epoch": 3 }, { "type": "loss", "content": 0.09901589900255203, "timestamp": "2025-09-02 14:20:08.476878", "step": 3389, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:08.543591", "step": 3389, "epoch": 3 }, { "type": "loss", "content": 0.17589472234249115, "timestamp": "2025-09-02 14:20:08.553238", "step": 3390, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:08.620909", "step": 3390, "epoch": 3 }, { "type": "loss", "content": 0.10408948361873627, "timestamp": "2025-09-02 14:20:08.632422", "step": 3391, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:08.690882", "step": 3391, "epoch": 3 }, { "type": "loss", "content": 0.1259026974439621, "timestamp": "2025-09-02 14:20:08.703205", "step": 3392, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:08.771191", "step": 3392, "epoch": 3 }, { "type": "loss", "content": 0.07189959287643433, "timestamp": "2025-09-02 14:20:08.776503", "step": 3393, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:08.836184", "step": 3393, "epoch": 3 }, { "type": "loss", "content": 0.2521967887878418, "timestamp": "2025-09-02 14:20:08.839683", "step": 3394, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:08.904113", "step": 3394, "epoch": 3 }, { "type": "loss", "content": 0.20600397884845734, "timestamp": "2025-09-02 14:20:08.922626", "step": 3395, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:08.979372", "step": 3395, "epoch": 3 }, { "type": "loss", "content": 0.17472799122333527, "timestamp": "2025-09-02 14:20:08.990084", "step": 3396, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:09.049922", "step": 3396, "epoch": 3 }, { "type": "loss", "content": 0.13816209137439728, "timestamp": "2025-09-02 14:20:09.059803", "step": 3397, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:09.124935", "step": 3397, "epoch": 3 }, { "type": "loss", "content": 0.12172947824001312, "timestamp": "2025-09-02 14:20:09.130286", "step": 3398, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:09.194721", "step": 3398, "epoch": 3 }, { "type": "loss", "content": 0.08827422559261322, "timestamp": "2025-09-02 14:20:09.201216", "step": 3399, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:09.262664", "step": 3399, "epoch": 3 }, { "type": "loss", "content": 0.19538626074790955, "timestamp": "2025-09-02 14:20:09.276360", "step": 3400, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:20:11.344547", "step": 3400, "epoch": 3 }, { "type": "pplx", "content": 9172.200924528917, "timestamp": "2025-09-02 14:20:11.350216", "step": 3400, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3400", "timestamp": "2025-09-02 14:20:12.061103", "step": 3400, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:12.124338", "step": 3400, "epoch": 3 }, { "type": "loss", "content": 0.10192326456308365, "timestamp": "2025-09-02 14:20:12.159177", "step": 3401, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:12.242026", "step": 3401, "epoch": 3 }, { "type": "loss", "content": 0.11698740720748901, "timestamp": "2025-09-02 14:20:12.253344", "step": 3402, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:12.314475", "step": 3402, "epoch": 3 }, { "type": "loss", "content": 0.2525162696838379, "timestamp": "2025-09-02 14:20:12.331805", "step": 3403, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:12.396500", "step": 3403, "epoch": 3 }, { "type": "loss", "content": 0.08117562532424927, "timestamp": "2025-09-02 14:20:12.404363", "step": 3404, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:12.466810", "step": 3404, "epoch": 3 }, { "type": "loss", "content": 0.18835866451263428, "timestamp": "2025-09-02 14:20:12.476095", "step": 3405, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:12.531507", "step": 3405, "epoch": 3 }, { "type": "loss", "content": 0.13650912046432495, "timestamp": "2025-09-02 14:20:12.539343", "step": 3406, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:12.597620", "step": 3406, "epoch": 3 }, { "type": "loss", "content": 0.19781775772571564, "timestamp": "2025-09-02 14:20:12.603827", "step": 3407, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:12.664273", "step": 3407, "epoch": 3 }, { "type": "loss", "content": 0.14164642989635468, "timestamp": "2025-09-02 14:20:12.672312", "step": 3408, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:12.727531", "step": 3408, "epoch": 3 }, { "type": "loss", "content": 0.14207929372787476, "timestamp": "2025-09-02 14:20:12.735869", "step": 3409, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:12.806903", "step": 3409, "epoch": 3 }, { "type": "loss", "content": 0.22501739859580994, "timestamp": "2025-09-02 14:20:12.814910", "step": 3410, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:12.875080", "step": 3410, "epoch": 3 }, { "type": "loss", "content": 0.06492775678634644, "timestamp": "2025-09-02 14:20:12.881102", "step": 3411, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:12.939298", "step": 3411, "epoch": 3 }, { "type": "loss", "content": 0.19035297632217407, "timestamp": "2025-09-02 14:20:12.947722", "step": 3412, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:13.010294", "step": 3412, "epoch": 3 }, { "type": "loss", "content": 0.19861091673374176, "timestamp": "2025-09-02 14:20:13.019770", "step": 3413, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:13.086321", "step": 3413, "epoch": 3 }, { "type": "loss", "content": 0.0733867809176445, "timestamp": "2025-09-02 14:20:13.093215", "step": 3414, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:13.156384", "step": 3414, "epoch": 3 }, { "type": "loss", "content": 0.1661188304424286, "timestamp": "2025-09-02 14:20:13.164668", "step": 3415, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:13.222744", "step": 3415, "epoch": 3 }, { "type": "loss", "content": 0.23452314734458923, "timestamp": "2025-09-02 14:20:13.236358", "step": 3416, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:13.295337", "step": 3416, "epoch": 3 }, { "type": "loss", "content": 0.22230270504951477, "timestamp": "2025-09-02 14:20:13.303713", "step": 3417, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:13.364629", "step": 3417, "epoch": 3 }, { "type": "loss", "content": 0.11145441979169846, "timestamp": "2025-09-02 14:20:13.371880", "step": 3418, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:13.437999", "step": 3418, "epoch": 3 }, { "type": "loss", "content": 0.18820130825042725, "timestamp": "2025-09-02 14:20:13.461741", "step": 3419, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:13.550764", "step": 3419, "epoch": 3 }, { "type": "loss", "content": 0.21694742143154144, "timestamp": "2025-09-02 14:20:13.565833", "step": 3420, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:20:15.680845", "step": 3420, "epoch": 3 }, { "type": "pplx", "content": 9180.93766994988, "timestamp": "2025-09-02 14:20:15.684728", "step": 3420, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:15.738848", "step": 3420, "epoch": 3 }, { "type": "loss", "content": 0.15007874369621277, "timestamp": "2025-09-02 14:20:15.740895", "step": 3421, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:15.799172", "step": 3421, "epoch": 3 }, { "type": "loss", "content": 0.19776271283626556, "timestamp": "2025-09-02 14:20:15.801806", "step": 3422, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:15.856264", "step": 3422, "epoch": 3 }, { "type": "loss", "content": 0.2287985384464264, "timestamp": "2025-09-02 14:20:15.858377", "step": 3423, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:15.913975", "step": 3423, "epoch": 3 }, { "type": "loss", "content": 0.0951138511300087, "timestamp": "2025-09-02 14:20:15.920968", "step": 3424, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:15.975533", "step": 3424, "epoch": 3 }, { "type": "loss", "content": 0.05588804557919502, "timestamp": "2025-09-02 14:20:15.977783", "step": 3425, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:16.032984", "step": 3425, "epoch": 3 }, { "type": "loss", "content": 0.0684896931052208, "timestamp": "2025-09-02 14:20:16.035188", "step": 3426, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:16.089383", "step": 3426, "epoch": 3 }, { "type": "loss", "content": 0.10977375507354736, "timestamp": "2025-09-02 14:20:16.091973", "step": 3427, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:16.146665", "step": 3427, "epoch": 3 }, { "type": "loss", "content": 0.055285800248384476, "timestamp": "2025-09-02 14:20:16.153703", "step": 3428, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:16.208823", "step": 3428, "epoch": 3 }, { "type": "loss", "content": 0.3721555173397064, "timestamp": "2025-09-02 14:20:16.211445", "step": 3429, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:16.265994", "step": 3429, "epoch": 3 }, { "type": "loss", "content": 0.13048751652240753, "timestamp": "2025-09-02 14:20:16.268545", "step": 3430, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:16.325695", "step": 3430, "epoch": 3 }, { "type": "loss", "content": 0.15587159991264343, "timestamp": "2025-09-02 14:20:16.328139", "step": 3431, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:16.385881", "step": 3431, "epoch": 3 }, { "type": "loss", "content": 0.06931553781032562, "timestamp": "2025-09-02 14:20:16.392314", "step": 3432, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:16.450231", "step": 3432, "epoch": 3 }, { "type": "loss", "content": 0.1729547083377838, "timestamp": "2025-09-02 14:20:16.453055", "step": 3433, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:16.508643", "step": 3433, "epoch": 3 }, { "type": "loss", "content": 0.05881184712052345, "timestamp": "2025-09-02 14:20:16.511954", "step": 3434, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:16.567905", "step": 3434, "epoch": 3 }, { "type": "loss", "content": 0.23186945915222168, "timestamp": "2025-09-02 14:20:16.570951", "step": 3435, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:16.627949", "step": 3435, "epoch": 3 }, { "type": "loss", "content": 0.16533565521240234, "timestamp": "2025-09-02 14:20:16.634770", "step": 3436, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:16.688567", "step": 3436, "epoch": 3 }, { "type": "loss", "content": 0.15686188638210297, "timestamp": "2025-09-02 14:20:16.690927", "step": 3437, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:16.746257", "step": 3437, "epoch": 3 }, { "type": "loss", "content": 0.11382665485143661, "timestamp": "2025-09-02 14:20:16.750012", "step": 3438, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:16.805955", "step": 3438, "epoch": 3 }, { "type": "loss", "content": 0.08850570023059845, "timestamp": "2025-09-02 14:20:16.808540", "step": 3439, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:16.864700", "step": 3439, "epoch": 3 }, { "type": "loss", "content": 0.17512118816375732, "timestamp": "2025-09-02 14:20:16.871217", "step": 3440, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:20:18.902922", "step": 3440, "epoch": 3 }, { "type": "pplx", "content": 9124.744784624465, "timestamp": "2025-09-02 14:20:18.905508", "step": 3440, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3440", "timestamp": "2025-09-02 14:20:19.337528", "step": 3440, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:19.393661", "step": 3440, "epoch": 3 }, { "type": "loss", "content": 0.24381820857524872, "timestamp": "2025-09-02 14:20:19.396250", "step": 3441, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:19.452401", "step": 3441, "epoch": 3 }, { "type": "loss", "content": 0.25540369749069214, "timestamp": "2025-09-02 14:20:19.454779", "step": 3442, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:19.516365", "step": 3442, "epoch": 3 }, { "type": "loss", "content": 0.04572911560535431, "timestamp": "2025-09-02 14:20:19.518898", "step": 3443, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:19.573988", "step": 3443, "epoch": 3 }, { "type": "loss", "content": 0.19490832090377808, "timestamp": "2025-09-02 14:20:19.582213", "step": 3444, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:19.638884", "step": 3444, "epoch": 3 }, { "type": "loss", "content": 0.16846472024917603, "timestamp": "2025-09-02 14:20:19.642708", "step": 3445, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:19.698259", "step": 3445, "epoch": 3 }, { "type": "loss", "content": 0.17405490577220917, "timestamp": "2025-09-02 14:20:19.700810", "step": 3446, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:19.756130", "step": 3446, "epoch": 3 }, { "type": "loss", "content": 0.06081828474998474, "timestamp": "2025-09-02 14:20:19.758994", "step": 3447, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:19.814179", "step": 3447, "epoch": 3 }, { "type": "loss", "content": 0.13247986137866974, "timestamp": "2025-09-02 14:20:19.820222", "step": 3448, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:19.883605", "step": 3448, "epoch": 3 }, { "type": "loss", "content": 0.18327902257442474, "timestamp": "2025-09-02 14:20:19.886219", "step": 3449, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:19.946282", "step": 3449, "epoch": 3 }, { "type": "loss", "content": 0.10492650419473648, "timestamp": "2025-09-02 14:20:19.948910", "step": 3450, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:20.007898", "step": 3450, "epoch": 3 }, { "type": "loss", "content": 0.12326741218566895, "timestamp": "2025-09-02 14:20:20.010197", "step": 3451, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:20.065191", "step": 3451, "epoch": 3 }, { "type": "loss", "content": 0.1656007170677185, "timestamp": "2025-09-02 14:20:20.071868", "step": 3452, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:20.126925", "step": 3452, "epoch": 3 }, { "type": "loss", "content": 0.14158208668231964, "timestamp": "2025-09-02 14:20:20.128871", "step": 3453, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:20.183810", "step": 3453, "epoch": 3 }, { "type": "loss", "content": 0.16918891668319702, "timestamp": "2025-09-02 14:20:20.186932", "step": 3454, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:20.242259", "step": 3454, "epoch": 3 }, { "type": "loss", "content": 0.15775910019874573, "timestamp": "2025-09-02 14:20:20.244566", "step": 3455, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:20.299891", "step": 3455, "epoch": 3 }, { "type": "loss", "content": 0.2573939561843872, "timestamp": "2025-09-02 14:20:20.306161", "step": 3456, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:20.361429", "step": 3456, "epoch": 3 }, { "type": "loss", "content": 0.14835967123508453, "timestamp": "2025-09-02 14:20:20.363799", "step": 3457, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:20.418806", "step": 3457, "epoch": 3 }, { "type": "loss", "content": 0.10356303304433823, "timestamp": "2025-09-02 14:20:20.421339", "step": 3458, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:20.476450", "step": 3458, "epoch": 3 }, { "type": "loss", "content": 0.0949944332242012, "timestamp": "2025-09-02 14:20:20.481001", "step": 3459, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:20.538777", "step": 3459, "epoch": 3 }, { "type": "loss", "content": 0.13792981207370758, "timestamp": "2025-09-02 14:20:20.545485", "step": 3460, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:20:22.449804", "step": 3460, "epoch": 3 }, { "type": "pplx", "content": 8789.242146694272, "timestamp": "2025-09-02 14:20:22.452351", "step": 3460, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:22.505408", "step": 3460, "epoch": 3 }, { "type": "loss", "content": 0.17804162204265594, "timestamp": "2025-09-02 14:20:22.507999", "step": 3461, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:22.561993", "step": 3461, "epoch": 3 }, { "type": "loss", "content": 0.20596924424171448, "timestamp": "2025-09-02 14:20:22.564648", "step": 3462, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:22.619523", "step": 3462, "epoch": 3 }, { "type": "loss", "content": 0.1456899493932724, "timestamp": "2025-09-02 14:20:22.622264", "step": 3463, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:22.675821", "step": 3463, "epoch": 3 }, { "type": "loss", "content": 0.10939262807369232, "timestamp": "2025-09-02 14:20:22.682316", "step": 3464, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:22.734734", "step": 3464, "epoch": 3 }, { "type": "loss", "content": 0.12812358140945435, "timestamp": "2025-09-02 14:20:22.737394", "step": 3465, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:22.790837", "step": 3465, "epoch": 3 }, { "type": "loss", "content": 0.17759793996810913, "timestamp": "2025-09-02 14:20:22.793323", "step": 3466, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:22.846212", "step": 3466, "epoch": 3 }, { "type": "loss", "content": 0.077081099152565, "timestamp": "2025-09-02 14:20:22.848897", "step": 3467, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:22.903176", "step": 3467, "epoch": 3 }, { "type": "loss", "content": 0.14934103190898895, "timestamp": "2025-09-02 14:20:22.909534", "step": 3468, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:22.962007", "step": 3468, "epoch": 3 }, { "type": "loss", "content": 0.04450749605894089, "timestamp": "2025-09-02 14:20:22.964496", "step": 3469, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:23.018716", "step": 3469, "epoch": 3 }, { "type": "loss", "content": 0.109067901968956, "timestamp": "2025-09-02 14:20:23.021057", "step": 3470, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:23.075183", "step": 3470, "epoch": 3 }, { "type": "loss", "content": 0.1517234593629837, "timestamp": "2025-09-02 14:20:23.078533", "step": 3471, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:23.134317", "step": 3471, "epoch": 3 }, { "type": "loss", "content": 0.1877007931470871, "timestamp": "2025-09-02 14:20:23.140586", "step": 3472, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:23.193938", "step": 3472, "epoch": 3 }, { "type": "loss", "content": 0.159190833568573, "timestamp": "2025-09-02 14:20:23.196077", "step": 3473, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:23.249800", "step": 3473, "epoch": 3 }, { "type": "loss", "content": 0.1246003583073616, "timestamp": "2025-09-02 14:20:23.252044", "step": 3474, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:23.304648", "step": 3474, "epoch": 3 }, { "type": "loss", "content": 0.23859207332134247, "timestamp": "2025-09-02 14:20:23.306968", "step": 3475, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:23.360294", "step": 3475, "epoch": 3 }, { "type": "loss", "content": 0.152079775929451, "timestamp": "2025-09-02 14:20:23.366595", "step": 3476, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:23.419330", "step": 3476, "epoch": 3 }, { "type": "loss", "content": 0.22129744291305542, "timestamp": "2025-09-02 14:20:23.421696", "step": 3477, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:23.474680", "step": 3477, "epoch": 3 }, { "type": "loss", "content": 0.23861663043498993, "timestamp": "2025-09-02 14:20:23.476949", "step": 3478, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:23.530550", "step": 3478, "epoch": 3 }, { "type": "loss", "content": 0.1336541622877121, "timestamp": "2025-09-02 14:20:23.533117", "step": 3479, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:23.586043", "step": 3479, "epoch": 3 }, { "type": "loss", "content": 0.11395163834095001, "timestamp": "2025-09-02 14:20:23.591840", "step": 3480, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:20:25.528248", "step": 3480, "epoch": 3 }, { "type": "pplx", "content": 8732.40395712388, "timestamp": "2025-09-02 14:20:25.530742", "step": 3480, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3480", "timestamp": "2025-09-02 14:20:25.951778", "step": 3480, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:26.008238", "step": 3480, "epoch": 3 }, { "type": "loss", "content": 0.13954435288906097, "timestamp": "2025-09-02 14:20:26.010389", "step": 3481, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:26.065772", "step": 3481, "epoch": 3 }, { "type": "loss", "content": 0.26997488737106323, "timestamp": "2025-09-02 14:20:26.069113", "step": 3482, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:26.123857", "step": 3482, "epoch": 3 }, { "type": "loss", "content": 0.1504383534193039, "timestamp": "2025-09-02 14:20:26.126308", "step": 3483, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:26.179496", "step": 3483, "epoch": 3 }, { "type": "loss", "content": 0.2594529390335083, "timestamp": "2025-09-02 14:20:26.185640", "step": 3484, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:26.238312", "step": 3484, "epoch": 3 }, { "type": "loss", "content": 0.2837013006210327, "timestamp": "2025-09-02 14:20:26.241068", "step": 3485, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:26.294899", "step": 3485, "epoch": 3 }, { "type": "loss", "content": 0.1823764145374298, "timestamp": "2025-09-02 14:20:26.297645", "step": 3486, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:26.351610", "step": 3486, "epoch": 3 }, { "type": "loss", "content": 0.15333575010299683, "timestamp": "2025-09-02 14:20:26.354184", "step": 3487, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:26.408429", "step": 3487, "epoch": 3 }, { "type": "loss", "content": 0.1048944890499115, "timestamp": "2025-09-02 14:20:26.414364", "step": 3488, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:26.471023", "step": 3488, "epoch": 3 }, { "type": "loss", "content": 0.11403868347406387, "timestamp": "2025-09-02 14:20:26.473509", "step": 3489, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:26.527196", "step": 3489, "epoch": 3 }, { "type": "loss", "content": 0.09001341462135315, "timestamp": "2025-09-02 14:20:26.529954", "step": 3490, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:26.594749", "step": 3490, "epoch": 3 }, { "type": "loss", "content": 0.15575838088989258, "timestamp": "2025-09-02 14:20:26.597423", "step": 3491, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:26.651043", "step": 3491, "epoch": 3 }, { "type": "loss", "content": 0.07489461451768875, "timestamp": "2025-09-02 14:20:26.657382", "step": 3492, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:26.710264", "step": 3492, "epoch": 3 }, { "type": "loss", "content": 0.12355247139930725, "timestamp": "2025-09-02 14:20:26.712855", "step": 3493, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:26.765787", "step": 3493, "epoch": 3 }, { "type": "loss", "content": 0.0928218811750412, "timestamp": "2025-09-02 14:20:26.768252", "step": 3494, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:26.821230", "step": 3494, "epoch": 3 }, { "type": "loss", "content": 0.15690888464450836, "timestamp": "2025-09-02 14:20:26.825088", "step": 3495, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:26.880496", "step": 3495, "epoch": 3 }, { "type": "loss", "content": 0.24310503900051117, "timestamp": "2025-09-02 14:20:26.906075", "step": 3496, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:26.975372", "step": 3496, "epoch": 3 }, { "type": "loss", "content": 0.15990443527698517, "timestamp": "2025-09-02 14:20:26.992891", "step": 3497, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:27.048052", "step": 3497, "epoch": 3 }, { "type": "loss", "content": 0.21127955615520477, "timestamp": "2025-09-02 14:20:27.052306", "step": 3498, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:27.108117", "step": 3498, "epoch": 3 }, { "type": "loss", "content": 0.03300917148590088, "timestamp": "2025-09-02 14:20:27.112212", "step": 3499, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:27.166744", "step": 3499, "epoch": 3 }, { "type": "loss", "content": 0.2550016939640045, "timestamp": "2025-09-02 14:20:27.174377", "step": 3500, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:20:29.126714", "step": 3500, "epoch": 3 }, { "type": "pplx", "content": 8626.073392683003, "timestamp": "2025-09-02 14:20:29.129898", "step": 3500, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:29.184847", "step": 3500, "epoch": 3 }, { "type": "loss", "content": 0.05501219630241394, "timestamp": "2025-09-02 14:20:29.192589", "step": 3501, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:29.248616", "step": 3501, "epoch": 3 }, { "type": "loss", "content": 0.13475798070430756, "timestamp": "2025-09-02 14:20:29.252354", "step": 3502, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:29.309948", "step": 3502, "epoch": 3 }, { "type": "loss", "content": 0.1130293533205986, "timestamp": "2025-09-02 14:20:29.311941", "step": 3503, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:29.366538", "step": 3503, "epoch": 3 }, { "type": "loss", "content": 0.11264336109161377, "timestamp": "2025-09-02 14:20:29.373330", "step": 3504, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:29.427036", "step": 3504, "epoch": 3 }, { "type": "loss", "content": 0.26652657985687256, "timestamp": "2025-09-02 14:20:29.430159", "step": 3505, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:29.485430", "step": 3505, "epoch": 3 }, { "type": "loss", "content": 0.12727883458137512, "timestamp": "2025-09-02 14:20:29.488316", "step": 3506, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:29.542717", "step": 3506, "epoch": 3 }, { "type": "loss", "content": 0.16097907721996307, "timestamp": "2025-09-02 14:20:29.545719", "step": 3507, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:29.601091", "step": 3507, "epoch": 3 }, { "type": "loss", "content": 0.1401330530643463, "timestamp": "2025-09-02 14:20:29.608326", "step": 3508, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:29.662763", "step": 3508, "epoch": 3 }, { "type": "loss", "content": 0.15142624080181122, "timestamp": "2025-09-02 14:20:29.665935", "step": 3509, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:29.720229", "step": 3509, "epoch": 3 }, { "type": "loss", "content": 0.07802019268274307, "timestamp": "2025-09-02 14:20:29.724004", "step": 3510, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:29.783130", "step": 3510, "epoch": 3 }, { "type": "loss", "content": 0.20275968313217163, "timestamp": "2025-09-02 14:20:29.786599", "step": 3511, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:29.848382", "step": 3511, "epoch": 3 }, { "type": "loss", "content": 0.10130272805690765, "timestamp": "2025-09-02 14:20:29.854849", "step": 3512, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:29.907760", "step": 3512, "epoch": 3 }, { "type": "loss", "content": 0.165353462100029, "timestamp": "2025-09-02 14:20:29.910404", "step": 3513, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:29.964467", "step": 3513, "epoch": 3 }, { "type": "loss", "content": 0.16357100009918213, "timestamp": "2025-09-02 14:20:29.966985", "step": 3514, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:30.021148", "step": 3514, "epoch": 3 }, { "type": "loss", "content": 0.1423690766096115, "timestamp": "2025-09-02 14:20:30.023896", "step": 3515, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:30.077533", "step": 3515, "epoch": 3 }, { "type": "loss", "content": 0.07039965689182281, "timestamp": "2025-09-02 14:20:30.084414", "step": 3516, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 3200019493504.0 }, "timestamp": "2025-09-02 14:20:30.138898", "step": 3516, "epoch": 3 }, { "type": "loss", "content": 0.2745664417743683, "timestamp": "2025-09-02 14:20:30.146432", "step": 3517, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:30.206079", "step": 3517, "epoch": 3 }, { "type": "loss", "content": 0.1024094894528389, "timestamp": "2025-09-02 14:20:30.210012", "step": 3518, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:30.265385", "step": 3518, "epoch": 3 }, { "type": "loss", "content": 0.16862830519676208, "timestamp": "2025-09-02 14:20:30.267699", "step": 3519, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:30.323669", "step": 3519, "epoch": 3 }, { "type": "loss", "content": 0.10296670347452164, "timestamp": "2025-09-02 14:20:30.330160", "step": 3520, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:20:32.235516", "step": 3520, "epoch": 3 }, { "type": "pplx", "content": 8764.778809792064, "timestamp": "2025-09-02 14:20:32.237896", "step": 3520, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3520", "timestamp": "2025-09-02 14:20:32.638499", "step": 3520, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:32.697161", "step": 3520, "epoch": 3 }, { "type": "loss", "content": 0.13131654262542725, "timestamp": "2025-09-02 14:20:32.699962", "step": 3521, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:32.756533", "step": 3521, "epoch": 3 }, { "type": "loss", "content": 0.10640706866979599, "timestamp": "2025-09-02 14:20:32.758686", "step": 3522, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:32.813235", "step": 3522, "epoch": 3 }, { "type": "loss", "content": 0.11971314996480942, "timestamp": "2025-09-02 14:20:32.815835", "step": 3523, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:32.871184", "step": 3523, "epoch": 3 }, { "type": "loss", "content": 0.35246753692626953, "timestamp": "2025-09-02 14:20:32.877357", "step": 3524, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:32.930223", "step": 3524, "epoch": 3 }, { "type": "loss", "content": 0.3388194441795349, "timestamp": "2025-09-02 14:20:32.932226", "step": 3525, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:32.985600", "step": 3525, "epoch": 3 }, { "type": "loss", "content": 0.12429800629615784, "timestamp": "2025-09-02 14:20:32.987538", "step": 3526, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:33.040775", "step": 3526, "epoch": 3 }, { "type": "loss", "content": 0.19839924573898315, "timestamp": "2025-09-02 14:20:33.043103", "step": 3527, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:33.096712", "step": 3527, "epoch": 3 }, { "type": "loss", "content": 0.17574745416641235, "timestamp": "2025-09-02 14:20:33.103068", "step": 3528, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:33.158039", "step": 3528, "epoch": 3 }, { "type": "loss", "content": 0.11941372603178024, "timestamp": "2025-09-02 14:20:33.160738", "step": 3529, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:33.214846", "step": 3529, "epoch": 3 }, { "type": "loss", "content": 0.11614133417606354, "timestamp": "2025-09-02 14:20:33.217424", "step": 3530, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:33.271443", "step": 3530, "epoch": 3 }, { "type": "loss", "content": 0.11981576681137085, "timestamp": "2025-09-02 14:20:33.273791", "step": 3531, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:33.328043", "step": 3531, "epoch": 3 }, { "type": "loss", "content": 0.06733594089746475, "timestamp": "2025-09-02 14:20:33.334316", "step": 3532, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:33.387405", "step": 3532, "epoch": 3 }, { "type": "loss", "content": 0.2323663979768753, "timestamp": "2025-09-02 14:20:33.389350", "step": 3533, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:33.442776", "step": 3533, "epoch": 3 }, { "type": "loss", "content": 0.2232254594564438, "timestamp": "2025-09-02 14:20:33.444773", "step": 3534, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:33.498547", "step": 3534, "epoch": 3 }, { "type": "loss", "content": 0.15520073473453522, "timestamp": "2025-09-02 14:20:33.500815", "step": 3535, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:33.553695", "step": 3535, "epoch": 3 }, { "type": "loss", "content": 0.20041003823280334, "timestamp": "2025-09-02 14:20:33.560806", "step": 3536, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:33.613927", "step": 3536, "epoch": 3 }, { "type": "loss", "content": 0.15091851353645325, "timestamp": "2025-09-02 14:20:33.617271", "step": 3537, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:33.672768", "step": 3537, "epoch": 3 }, { "type": "loss", "content": 0.08132489025592804, "timestamp": "2025-09-02 14:20:33.675143", "step": 3538, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:33.729777", "step": 3538, "epoch": 3 }, { "type": "loss", "content": 0.21339409053325653, "timestamp": "2025-09-02 14:20:33.732096", "step": 3539, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:33.785713", "step": 3539, "epoch": 3 }, { "type": "loss", "content": 0.22073611617088318, "timestamp": "2025-09-02 14:20:33.791892", "step": 3540, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:20:35.657809", "step": 3540, "epoch": 3 }, { "type": "pplx", "content": 8797.104819786919, "timestamp": "2025-09-02 14:20:35.660063", "step": 3540, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:35.714128", "step": 3540, "epoch": 3 }, { "type": "loss", "content": 0.1063186377286911, "timestamp": "2025-09-02 14:20:35.716464", "step": 3541, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:35.771666", "step": 3541, "epoch": 3 }, { "type": "loss", "content": 0.2221163660287857, "timestamp": "2025-09-02 14:20:35.773825", "step": 3542, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:35.827740", "step": 3542, "epoch": 3 }, { "type": "loss", "content": 0.22000372409820557, "timestamp": "2025-09-02 14:20:35.830332", "step": 3543, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:35.884347", "step": 3543, "epoch": 3 }, { "type": "loss", "content": 0.045791078358888626, "timestamp": "2025-09-02 14:20:35.890530", "step": 3544, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:35.944017", "step": 3544, "epoch": 3 }, { "type": "loss", "content": 0.2908235788345337, "timestamp": "2025-09-02 14:20:35.946253", "step": 3545, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:35.999440", "step": 3545, "epoch": 3 }, { "type": "loss", "content": 0.16208955645561218, "timestamp": "2025-09-02 14:20:36.001736", "step": 3546, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:36.055181", "step": 3546, "epoch": 3 }, { "type": "loss", "content": 0.24574878811836243, "timestamp": "2025-09-02 14:20:36.057366", "step": 3547, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:36.110815", "step": 3547, "epoch": 3 }, { "type": "loss", "content": 0.2544442117214203, "timestamp": "2025-09-02 14:20:36.117098", "step": 3548, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:36.170651", "step": 3548, "epoch": 3 }, { "type": "loss", "content": 0.2649095058441162, "timestamp": "2025-09-02 14:20:36.172535", "step": 3549, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:36.226112", "step": 3549, "epoch": 3 }, { "type": "loss", "content": 0.12039937824010849, "timestamp": "2025-09-02 14:20:36.228605", "step": 3550, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:36.282137", "step": 3550, "epoch": 3 }, { "type": "loss", "content": 0.1418890506029129, "timestamp": "2025-09-02 14:20:36.283807", "step": 3551, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:36.336825", "step": 3551, "epoch": 3 }, { "type": "loss", "content": 0.15298080444335938, "timestamp": "2025-09-02 14:20:36.342921", "step": 3552, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:36.396578", "step": 3552, "epoch": 3 }, { "type": "loss", "content": 0.09700776636600494, "timestamp": "2025-09-02 14:20:36.398873", "step": 3553, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:36.452861", "step": 3553, "epoch": 3 }, { "type": "loss", "content": 0.1680152267217636, "timestamp": "2025-09-02 14:20:36.455312", "step": 3554, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:36.511259", "step": 3554, "epoch": 3 }, { "type": "loss", "content": 0.19704753160476685, "timestamp": "2025-09-02 14:20:36.513583", "step": 3555, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:36.567752", "step": 3555, "epoch": 3 }, { "type": "loss", "content": 0.17575035989284515, "timestamp": "2025-09-02 14:20:36.573832", "step": 3556, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:36.627279", "step": 3556, "epoch": 3 }, { "type": "loss", "content": 0.15965697169303894, "timestamp": "2025-09-02 14:20:36.629298", "step": 3557, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:36.682012", "step": 3557, "epoch": 3 }, { "type": "loss", "content": 0.2394590824842453, "timestamp": "2025-09-02 14:20:36.683787", "step": 3558, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:36.736820", "step": 3558, "epoch": 3 }, { "type": "loss", "content": 0.10877621173858643, "timestamp": "2025-09-02 14:20:36.739129", "step": 3559, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:36.792434", "step": 3559, "epoch": 3 }, { "type": "loss", "content": 0.3062446117401123, "timestamp": "2025-09-02 14:20:36.798536", "step": 3560, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:20:38.807346", "step": 3560, "epoch": 3 }, { "type": "pplx", "content": 8618.983458616196, "timestamp": "2025-09-02 14:20:38.809933", "step": 3560, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3560", "timestamp": "2025-09-02 14:20:39.221413", "step": 3560, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:39.279954", "step": 3560, "epoch": 3 }, { "type": "loss", "content": 0.08664455264806747, "timestamp": "2025-09-02 14:20:39.283209", "step": 3561, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:39.340836", "step": 3561, "epoch": 3 }, { "type": "loss", "content": 0.09220175445079803, "timestamp": "2025-09-02 14:20:39.343536", "step": 3562, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:39.398150", "step": 3562, "epoch": 3 }, { "type": "loss", "content": 0.10937684029340744, "timestamp": "2025-09-02 14:20:39.400771", "step": 3563, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:39.457033", "step": 3563, "epoch": 3 }, { "type": "loss", "content": 0.10463779419660568, "timestamp": "2025-09-02 14:20:39.463826", "step": 3564, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:39.518047", "step": 3564, "epoch": 3 }, { "type": "loss", "content": 0.14029651880264282, "timestamp": "2025-09-02 14:20:39.520599", "step": 3565, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:39.574407", "step": 3565, "epoch": 3 }, { "type": "loss", "content": 0.11889435350894928, "timestamp": "2025-09-02 14:20:39.576934", "step": 3566, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:39.631102", "step": 3566, "epoch": 3 }, { "type": "loss", "content": 0.16638930141925812, "timestamp": "2025-09-02 14:20:39.634013", "step": 3567, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:39.688029", "step": 3567, "epoch": 3 }, { "type": "loss", "content": 0.19709818065166473, "timestamp": "2025-09-02 14:20:39.694413", "step": 3568, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:39.747285", "step": 3568, "epoch": 3 }, { "type": "loss", "content": 0.09710143506526947, "timestamp": "2025-09-02 14:20:39.750196", "step": 3569, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:39.803712", "step": 3569, "epoch": 3 }, { "type": "loss", "content": 0.1175459697842598, "timestamp": "2025-09-02 14:20:39.805936", "step": 3570, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:39.860487", "step": 3570, "epoch": 3 }, { "type": "loss", "content": 0.18060153722763062, "timestamp": "2025-09-02 14:20:39.863318", "step": 3571, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:39.917442", "step": 3571, "epoch": 3 }, { "type": "loss", "content": 0.11122419685125351, "timestamp": "2025-09-02 14:20:39.924053", "step": 3572, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:39.977027", "step": 3572, "epoch": 3 }, { "type": "loss", "content": 0.11243066191673279, "timestamp": "2025-09-02 14:20:39.979399", "step": 3573, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:40.034079", "step": 3573, "epoch": 3 }, { "type": "loss", "content": 0.1275808960199356, "timestamp": "2025-09-02 14:20:40.036755", "step": 3574, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:40.090274", "step": 3574, "epoch": 3 }, { "type": "loss", "content": 0.18265140056610107, "timestamp": "2025-09-02 14:20:40.092739", "step": 3575, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:40.147546", "step": 3575, "epoch": 3 }, { "type": "loss", "content": 0.11113951355218887, "timestamp": "2025-09-02 14:20:40.154016", "step": 3576, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:40.206582", "step": 3576, "epoch": 3 }, { "type": "loss", "content": 0.2212965339422226, "timestamp": "2025-09-02 14:20:40.209498", "step": 3577, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:40.262391", "step": 3577, "epoch": 3 }, { "type": "loss", "content": 0.13503868877887726, "timestamp": "2025-09-02 14:20:40.264899", "step": 3578, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:40.318489", "step": 3578, "epoch": 3 }, { "type": "loss", "content": 0.26617124676704407, "timestamp": "2025-09-02 14:20:40.320835", "step": 3579, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:40.374543", "step": 3579, "epoch": 3 }, { "type": "loss", "content": 0.14383432269096375, "timestamp": "2025-09-02 14:20:40.380920", "step": 3580, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:20:42.282345", "step": 3580, "epoch": 3 }, { "type": "pplx", "content": 8364.156683608908, "timestamp": "2025-09-02 14:20:42.284669", "step": 3580, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:42.338557", "step": 3580, "epoch": 3 }, { "type": "loss", "content": 0.14319241046905518, "timestamp": "2025-09-02 14:20:42.340950", "step": 3581, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:42.396822", "step": 3581, "epoch": 3 }, { "type": "loss", "content": 0.0763726532459259, "timestamp": "2025-09-02 14:20:42.399859", "step": 3582, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:42.453949", "step": 3582, "epoch": 3 }, { "type": "loss", "content": 0.1490836888551712, "timestamp": "2025-09-02 14:20:42.456543", "step": 3583, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:42.510795", "step": 3583, "epoch": 3 }, { "type": "loss", "content": 0.19556012749671936, "timestamp": "2025-09-02 14:20:42.517100", "step": 3584, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:42.569867", "step": 3584, "epoch": 3 }, { "type": "loss", "content": 0.17671485245227814, "timestamp": "2025-09-02 14:20:42.572143", "step": 3585, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:42.625604", "step": 3585, "epoch": 3 }, { "type": "loss", "content": 0.13170231878757477, "timestamp": "2025-09-02 14:20:42.627870", "step": 3586, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:42.681343", "step": 3586, "epoch": 3 }, { "type": "loss", "content": 0.14915163815021515, "timestamp": "2025-09-02 14:20:42.683926", "step": 3587, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:42.737706", "step": 3587, "epoch": 3 }, { "type": "loss", "content": 0.12623657286167145, "timestamp": "2025-09-02 14:20:42.744192", "step": 3588, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:42.796645", "step": 3588, "epoch": 3 }, { "type": "loss", "content": 0.1774464100599289, "timestamp": "2025-09-02 14:20:42.798882", "step": 3589, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:42.853094", "step": 3589, "epoch": 3 }, { "type": "loss", "content": 0.11918822675943375, "timestamp": "2025-09-02 14:20:42.855547", "step": 3590, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:42.908656", "step": 3590, "epoch": 3 }, { "type": "loss", "content": 0.2602778375148773, "timestamp": "2025-09-02 14:20:42.910965", "step": 3591, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:42.964516", "step": 3591, "epoch": 3 }, { "type": "loss", "content": 0.1830679029226303, "timestamp": "2025-09-02 14:20:42.970580", "step": 3592, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:43.023565", "step": 3592, "epoch": 3 }, { "type": "loss", "content": 0.14590512216091156, "timestamp": "2025-09-02 14:20:43.025818", "step": 3593, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:43.078865", "step": 3593, "epoch": 3 }, { "type": "loss", "content": 0.0860469713807106, "timestamp": "2025-09-02 14:20:43.081201", "step": 3594, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:43.134228", "step": 3594, "epoch": 3 }, { "type": "loss", "content": 0.23915337026119232, "timestamp": "2025-09-02 14:20:43.137076", "step": 3595, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:43.190971", "step": 3595, "epoch": 3 }, { "type": "loss", "content": 0.16491645574569702, "timestamp": "2025-09-02 14:20:43.196937", "step": 3596, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:43.249880", "step": 3596, "epoch": 3 }, { "type": "loss", "content": 0.2106560468673706, "timestamp": "2025-09-02 14:20:43.252231", "step": 3597, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:43.305299", "step": 3597, "epoch": 3 }, { "type": "loss", "content": 0.3316024839878082, "timestamp": "2025-09-02 14:20:43.307786", "step": 3598, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:43.362325", "step": 3598, "epoch": 3 }, { "type": "loss", "content": 0.15912622213363647, "timestamp": "2025-09-02 14:20:43.364858", "step": 3599, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:43.417763", "step": 3599, "epoch": 3 }, { "type": "loss", "content": 0.14967778325080872, "timestamp": "2025-09-02 14:20:43.424172", "step": 3600, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:20:45.311312", "step": 3600, "epoch": 3 }, { "type": "pplx", "content": 8147.246747115582, "timestamp": "2025-09-02 14:20:45.313557", "step": 3600, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3600", "timestamp": "2025-09-02 14:20:45.719234", "step": 3600, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:45.776995", "step": 3600, "epoch": 3 }, { "type": "loss", "content": 0.12464648485183716, "timestamp": "2025-09-02 14:20:45.782345", "step": 3601, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:45.839357", "step": 3601, "epoch": 3 }, { "type": "loss", "content": 0.14495882391929626, "timestamp": "2025-09-02 14:20:45.846849", "step": 3602, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:45.902154", "step": 3602, "epoch": 3 }, { "type": "loss", "content": 0.13079111278057098, "timestamp": "2025-09-02 14:20:45.904760", "step": 3603, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:45.960733", "step": 3603, "epoch": 3 }, { "type": "loss", "content": 0.1608773022890091, "timestamp": "2025-09-02 14:20:45.969074", "step": 3604, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:46.028750", "step": 3604, "epoch": 3 }, { "type": "loss", "content": 0.1101132407784462, "timestamp": "2025-09-02 14:20:46.035974", "step": 3605, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:46.097371", "step": 3605, "epoch": 3 }, { "type": "loss", "content": 0.09068082273006439, "timestamp": "2025-09-02 14:20:46.103246", "step": 3606, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:46.158283", "step": 3606, "epoch": 3 }, { "type": "loss", "content": 0.10748220235109329, "timestamp": "2025-09-02 14:20:46.162766", "step": 3607, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:46.226444", "step": 3607, "epoch": 3 }, { "type": "loss", "content": 0.2618456482887268, "timestamp": "2025-09-02 14:20:46.233062", "step": 3608, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:46.286420", "step": 3608, "epoch": 3 }, { "type": "loss", "content": 0.22257065773010254, "timestamp": "2025-09-02 14:20:46.289402", "step": 3609, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:46.348302", "step": 3609, "epoch": 3 }, { "type": "loss", "content": 0.1414661854505539, "timestamp": "2025-09-02 14:20:46.351945", "step": 3610, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:46.409123", "step": 3610, "epoch": 3 }, { "type": "loss", "content": 0.1486176699399948, "timestamp": "2025-09-02 14:20:46.418779", "step": 3611, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:46.482732", "step": 3611, "epoch": 3 }, { "type": "loss", "content": 0.11845646798610687, "timestamp": "2025-09-02 14:20:46.489941", "step": 3612, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:46.542953", "step": 3612, "epoch": 3 }, { "type": "loss", "content": 0.14507809281349182, "timestamp": "2025-09-02 14:20:46.559460", "step": 3613, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:46.617136", "step": 3613, "epoch": 3 }, { "type": "loss", "content": 0.080739825963974, "timestamp": "2025-09-02 14:20:46.630358", "step": 3614, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:46.686829", "step": 3614, "epoch": 3 }, { "type": "loss", "content": 0.1285288780927658, "timestamp": "2025-09-02 14:20:46.689510", "step": 3615, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:46.744189", "step": 3615, "epoch": 3 }, { "type": "loss", "content": 0.09114041179418564, "timestamp": "2025-09-02 14:20:46.751028", "step": 3616, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:46.808241", "step": 3616, "epoch": 3 }, { "type": "loss", "content": 0.08040418475866318, "timestamp": "2025-09-02 14:20:46.814245", "step": 3617, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:46.868785", "step": 3617, "epoch": 3 }, { "type": "loss", "content": 0.16049407422542572, "timestamp": "2025-09-02 14:20:46.871857", "step": 3618, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:46.926011", "step": 3618, "epoch": 3 }, { "type": "loss", "content": 0.16242286562919617, "timestamp": "2025-09-02 14:20:46.933842", "step": 3619, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:46.998001", "step": 3619, "epoch": 3 }, { "type": "loss", "content": 0.1351245641708374, "timestamp": "2025-09-02 14:20:47.004161", "step": 3620, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:20:49.242313", "step": 3620, "epoch": 3 }, { "type": "pplx", "content": 8306.515941882393, "timestamp": "2025-09-02 14:20:49.244921", "step": 3620, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:49.299551", "step": 3620, "epoch": 3 }, { "type": "loss", "content": 0.16551660001277924, "timestamp": "2025-09-02 14:20:49.302743", "step": 3621, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:49.359009", "step": 3621, "epoch": 3 }, { "type": "loss", "content": 0.20716683566570282, "timestamp": "2025-09-02 14:20:49.364540", "step": 3622, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:49.422975", "step": 3622, "epoch": 3 }, { "type": "loss", "content": 0.19423946738243103, "timestamp": "2025-09-02 14:20:49.425551", "step": 3623, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:49.483240", "step": 3623, "epoch": 3 }, { "type": "loss", "content": 0.11765499413013458, "timestamp": "2025-09-02 14:20:49.490383", "step": 3624, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:49.545883", "step": 3624, "epoch": 3 }, { "type": "loss", "content": 0.15637758374214172, "timestamp": "2025-09-02 14:20:49.549217", "step": 3625, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:49.602327", "step": 3625, "epoch": 3 }, { "type": "loss", "content": 0.18256960809230804, "timestamp": "2025-09-02 14:20:49.604348", "step": 3626, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:49.658309", "step": 3626, "epoch": 3 }, { "type": "loss", "content": 0.1253015398979187, "timestamp": "2025-09-02 14:20:49.660613", "step": 3627, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:49.713803", "step": 3627, "epoch": 3 }, { "type": "loss", "content": 0.12677687406539917, "timestamp": "2025-09-02 14:20:49.720322", "step": 3628, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:49.774426", "step": 3628, "epoch": 3 }, { "type": "loss", "content": 0.16127942502498627, "timestamp": "2025-09-02 14:20:49.776871", "step": 3629, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:49.831213", "step": 3629, "epoch": 3 }, { "type": "loss", "content": 0.1883133053779602, "timestamp": "2025-09-02 14:20:49.833968", "step": 3630, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:49.887413", "step": 3630, "epoch": 3 }, { "type": "loss", "content": 0.10743749141693115, "timestamp": "2025-09-02 14:20:49.890438", "step": 3631, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:49.944209", "step": 3631, "epoch": 3 }, { "type": "loss", "content": 0.15701448917388916, "timestamp": "2025-09-02 14:20:49.951004", "step": 3632, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:50.004383", "step": 3632, "epoch": 3 }, { "type": "loss", "content": 0.18966755270957947, "timestamp": "2025-09-02 14:20:50.006524", "step": 3633, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:50.060866", "step": 3633, "epoch": 3 }, { "type": "loss", "content": 0.1213853508234024, "timestamp": "2025-09-02 14:20:50.067442", "step": 3634, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:50.125003", "step": 3634, "epoch": 3 }, { "type": "loss", "content": 0.1097906306385994, "timestamp": "2025-09-02 14:20:50.128078", "step": 3635, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:50.183665", "step": 3635, "epoch": 3 }, { "type": "loss", "content": 0.14998866617679596, "timestamp": "2025-09-02 14:20:50.207171", "step": 3636, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:50.270966", "step": 3636, "epoch": 3 }, { "type": "loss", "content": 0.13623636960983276, "timestamp": "2025-09-02 14:20:50.273043", "step": 3637, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:50.327407", "step": 3637, "epoch": 3 }, { "type": "loss", "content": 0.10416514426469803, "timestamp": "2025-09-02 14:20:50.330540", "step": 3638, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:50.390293", "step": 3638, "epoch": 3 }, { "type": "loss", "content": 0.17343318462371826, "timestamp": "2025-09-02 14:20:50.392360", "step": 3639, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:50.454514", "step": 3639, "epoch": 3 }, { "type": "loss", "content": 0.0688050165772438, "timestamp": "2025-09-02 14:20:50.460834", "step": 3640, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:20:52.686564", "step": 3640, "epoch": 3 }, { "type": "pplx", "content": 8546.2353214014, "timestamp": "2025-09-02 14:20:52.693778", "step": 3640, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3640", "timestamp": "2025-09-02 14:20:53.128631", "step": 3640, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:53.188609", "step": 3640, "epoch": 3 }, { "type": "loss", "content": 0.22403809428215027, "timestamp": "2025-09-02 14:20:53.191122", "step": 3641, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:53.249493", "step": 3641, "epoch": 3 }, { "type": "loss", "content": 0.25828099250793457, "timestamp": "2025-09-02 14:20:53.252033", "step": 3642, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:53.316171", "step": 3642, "epoch": 3 }, { "type": "loss", "content": 0.12254045903682709, "timestamp": "2025-09-02 14:20:53.318772", "step": 3643, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:53.373928", "step": 3643, "epoch": 3 }, { "type": "loss", "content": 0.1377359926700592, "timestamp": "2025-09-02 14:20:53.380412", "step": 3644, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:53.439741", "step": 3644, "epoch": 3 }, { "type": "loss", "content": 0.2463141828775406, "timestamp": "2025-09-02 14:20:53.441999", "step": 3645, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:53.499425", "step": 3645, "epoch": 3 }, { "type": "loss", "content": 0.16273722052574158, "timestamp": "2025-09-02 14:20:53.501942", "step": 3646, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:53.558999", "step": 3646, "epoch": 3 }, { "type": "loss", "content": 0.08663361519575119, "timestamp": "2025-09-02 14:20:53.562057", "step": 3647, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:53.624883", "step": 3647, "epoch": 3 }, { "type": "loss", "content": 0.1544448882341385, "timestamp": "2025-09-02 14:20:53.634064", "step": 3648, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:53.692536", "step": 3648, "epoch": 3 }, { "type": "loss", "content": 0.11112189292907715, "timestamp": "2025-09-02 14:20:53.697018", "step": 3649, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:53.760031", "step": 3649, "epoch": 3 }, { "type": "loss", "content": 0.139377623796463, "timestamp": "2025-09-02 14:20:53.767497", "step": 3650, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:53.833910", "step": 3650, "epoch": 3 }, { "type": "loss", "content": 0.1691487580537796, "timestamp": "2025-09-02 14:20:53.840994", "step": 3651, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:53.920031", "step": 3651, "epoch": 3 }, { "type": "loss", "content": 0.16070489585399628, "timestamp": "2025-09-02 14:20:53.937198", "step": 3652, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:54.017474", "step": 3652, "epoch": 3 }, { "type": "loss", "content": 0.11833713203668594, "timestamp": "2025-09-02 14:20:54.026425", "step": 3653, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:54.125563", "step": 3653, "epoch": 3 }, { "type": "loss", "content": 0.2750966250896454, "timestamp": "2025-09-02 14:20:54.135335", "step": 3654, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:54.206976", "step": 3654, "epoch": 3 }, { "type": "loss", "content": 0.20160692930221558, "timestamp": "2025-09-02 14:20:54.216464", "step": 3655, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:54.296507", "step": 3655, "epoch": 3 }, { "type": "loss", "content": 0.25443655252456665, "timestamp": "2025-09-02 14:20:54.303875", "step": 3656, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:54.379327", "step": 3656, "epoch": 3 }, { "type": "loss", "content": 0.19401030242443085, "timestamp": "2025-09-02 14:20:54.385993", "step": 3657, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:54.469152", "step": 3657, "epoch": 3 }, { "type": "loss", "content": 0.21229703724384308, "timestamp": "2025-09-02 14:20:54.477937", "step": 3658, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:54.540596", "step": 3658, "epoch": 3 }, { "type": "loss", "content": 0.1386672854423523, "timestamp": "2025-09-02 14:20:54.561644", "step": 3659, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:54.629510", "step": 3659, "epoch": 3 }, { "type": "loss", "content": 0.07093927264213562, "timestamp": "2025-09-02 14:20:54.645331", "step": 3660, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:20:57.916196", "step": 3660, "epoch": 3 }, { "type": "pplx", "content": 8448.318340335058, "timestamp": "2025-09-02 14:20:57.928442", "step": 3660, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:57.994683", "step": 3660, "epoch": 3 }, { "type": "loss", "content": 0.10353728383779526, "timestamp": "2025-09-02 14:20:58.004923", "step": 3661, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:58.074332", "step": 3661, "epoch": 3 }, { "type": "loss", "content": 0.3243357539176941, "timestamp": "2025-09-02 14:20:58.081263", "step": 3662, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:58.150711", "step": 3662, "epoch": 3 }, { "type": "loss", "content": 0.1793522834777832, "timestamp": "2025-09-02 14:20:58.162899", "step": 3663, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:58.235807", "step": 3663, "epoch": 3 }, { "type": "loss", "content": 0.22577433288097382, "timestamp": "2025-09-02 14:20:58.259050", "step": 3664, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:58.328232", "step": 3664, "epoch": 3 }, { "type": "loss", "content": 0.12639650702476501, "timestamp": "2025-09-02 14:20:58.342702", "step": 3665, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:58.426794", "step": 3665, "epoch": 3 }, { "type": "loss", "content": 0.1593439131975174, "timestamp": "2025-09-02 14:20:58.432210", "step": 3666, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:20:58.514227", "step": 3666, "epoch": 3 }, { "type": "loss", "content": 0.18187543749809265, "timestamp": "2025-09-02 14:20:58.528027", "step": 3667, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:58.615433", "step": 3667, "epoch": 3 }, { "type": "loss", "content": 0.12421228736639023, "timestamp": "2025-09-02 14:20:58.630132", "step": 3668, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:58.714012", "step": 3668, "epoch": 3 }, { "type": "loss", "content": 0.19265292584896088, "timestamp": "2025-09-02 14:20:58.727280", "step": 3669, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:58.818275", "step": 3669, "epoch": 3 }, { "type": "loss", "content": 0.24151933193206787, "timestamp": "2025-09-02 14:20:58.827718", "step": 3670, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:58.905855", "step": 3670, "epoch": 3 }, { "type": "loss", "content": 0.08623940497636795, "timestamp": "2025-09-02 14:20:58.911569", "step": 3671, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:58.982174", "step": 3671, "epoch": 3 }, { "type": "loss", "content": 0.08821509033441544, "timestamp": "2025-09-02 14:20:58.999840", "step": 3672, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:59.072464", "step": 3672, "epoch": 3 }, { "type": "loss", "content": 0.25542399287223816, "timestamp": "2025-09-02 14:20:59.081362", "step": 3673, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:59.155802", "step": 3673, "epoch": 3 }, { "type": "loss", "content": 0.18636076152324677, "timestamp": "2025-09-02 14:20:59.182097", "step": 3674, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:20:59.283172", "step": 3674, "epoch": 3 }, { "type": "loss", "content": 0.22596494853496552, "timestamp": "2025-09-02 14:20:59.290794", "step": 3675, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:59.358954", "step": 3675, "epoch": 3 }, { "type": "loss", "content": 0.06921659409999847, "timestamp": "2025-09-02 14:20:59.367203", "step": 3676, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:59.444576", "step": 3676, "epoch": 3 }, { "type": "loss", "content": 0.13785046339035034, "timestamp": "2025-09-02 14:20:59.454386", "step": 3677, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:59.541967", "step": 3677, "epoch": 3 }, { "type": "loss", "content": 0.13382503390312195, "timestamp": "2025-09-02 14:20:59.553229", "step": 3678, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:59.625689", "step": 3678, "epoch": 3 }, { "type": "loss", "content": 0.22102443873882294, "timestamp": "2025-09-02 14:20:59.640842", "step": 3679, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:20:59.731082", "step": 3679, "epoch": 3 }, { "type": "loss", "content": 0.12405215203762054, "timestamp": "2025-09-02 14:20:59.741732", "step": 3680, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:21:03.246355", "step": 3680, "epoch": 3 }, { "type": "pplx", "content": 8251.469942508844, "timestamp": "2025-09-02 14:21:03.259933", "step": 3680, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3680", "timestamp": "2025-09-02 14:21:03.773385", "step": 3680, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:03.847728", "step": 3680, "epoch": 3 }, { "type": "loss", "content": 0.1706307977437973, "timestamp": "2025-09-02 14:21:03.851923", "step": 3681, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:03.921740", "step": 3681, "epoch": 3 }, { "type": "loss", "content": 0.1353771835565567, "timestamp": "2025-09-02 14:21:03.937614", "step": 3682, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:04.034630", "step": 3682, "epoch": 3 }, { "type": "loss", "content": 0.17135682702064514, "timestamp": "2025-09-02 14:21:04.039021", "step": 3683, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:04.094473", "step": 3683, "epoch": 3 }, { "type": "loss", "content": 0.13443443179130554, "timestamp": "2025-09-02 14:21:04.102240", "step": 3684, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:04.156465", "step": 3684, "epoch": 3 }, { "type": "loss", "content": 0.19300296902656555, "timestamp": "2025-09-02 14:21:04.170709", "step": 3685, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:04.237202", "step": 3685, "epoch": 3 }, { "type": "loss", "content": 0.12741932272911072, "timestamp": "2025-09-02 14:21:04.241071", "step": 3686, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:04.298720", "step": 3686, "epoch": 3 }, { "type": "loss", "content": 0.06225605309009552, "timestamp": "2025-09-02 14:21:04.302835", "step": 3687, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:04.357814", "step": 3687, "epoch": 3 }, { "type": "loss", "content": 0.18329232931137085, "timestamp": "2025-09-02 14:21:04.379494", "step": 3688, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:04.437395", "step": 3688, "epoch": 3 }, { "type": "loss", "content": 0.1441662311553955, "timestamp": "2025-09-02 14:21:04.441153", "step": 3689, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:04.510269", "step": 3689, "epoch": 3 }, { "type": "loss", "content": 0.05218483507633209, "timestamp": "2025-09-02 14:21:04.528968", "step": 3690, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:04.599028", "step": 3690, "epoch": 3 }, { "type": "loss", "content": 0.1694667786359787, "timestamp": "2025-09-02 14:21:04.602300", "step": 3691, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:04.656642", "step": 3691, "epoch": 3 }, { "type": "loss", "content": 0.08685601502656937, "timestamp": "2025-09-02 14:21:04.679443", "step": 3692, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:04.746801", "step": 3692, "epoch": 3 }, { "type": "loss", "content": 0.14855323731899261, "timestamp": "2025-09-02 14:21:04.750867", "step": 3693, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:04.807171", "step": 3693, "epoch": 3 }, { "type": "loss", "content": 0.19892767071723938, "timestamp": "2025-09-02 14:21:04.827794", "step": 3694, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:04.898527", "step": 3694, "epoch": 3 }, { "type": "loss", "content": 0.09722639620304108, "timestamp": "2025-09-02 14:21:04.912699", "step": 3695, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:04.978813", "step": 3695, "epoch": 3 }, { "type": "loss", "content": 0.2084965705871582, "timestamp": "2025-09-02 14:21:04.993803", "step": 3696, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:05.059264", "step": 3696, "epoch": 3 }, { "type": "loss", "content": 0.055347103625535965, "timestamp": "2025-09-02 14:21:05.063314", "step": 3697, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:05.117118", "step": 3697, "epoch": 3 }, { "type": "loss", "content": 0.06905371695756912, "timestamp": "2025-09-02 14:21:05.120343", "step": 3698, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:05.176663", "step": 3698, "epoch": 3 }, { "type": "loss", "content": 0.14142587780952454, "timestamp": "2025-09-02 14:21:05.179189", "step": 3699, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:05.232594", "step": 3699, "epoch": 3 }, { "type": "loss", "content": 0.16757924854755402, "timestamp": "2025-09-02 14:21:05.238969", "step": 3700, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:21:07.205606", "step": 3700, "epoch": 3 }, { "type": "pplx", "content": 8300.938720170006, "timestamp": "2025-09-02 14:21:07.208116", "step": 3700, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:07.261709", "step": 3700, "epoch": 3 }, { "type": "loss", "content": 0.09657865762710571, "timestamp": "2025-09-02 14:21:07.264396", "step": 3701, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:07.318617", "step": 3701, "epoch": 3 }, { "type": "loss", "content": 0.13335669040679932, "timestamp": "2025-09-02 14:21:07.320795", "step": 3702, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:07.373916", "step": 3702, "epoch": 3 }, { "type": "loss", "content": 0.07898291200399399, "timestamp": "2025-09-02 14:21:07.376608", "step": 3703, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:07.430174", "step": 3703, "epoch": 3 }, { "type": "loss", "content": 0.09194926917552948, "timestamp": "2025-09-02 14:21:07.436917", "step": 3704, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:07.490019", "step": 3704, "epoch": 3 }, { "type": "loss", "content": 0.1352645456790924, "timestamp": "2025-09-02 14:21:07.493894", "step": 3705, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:07.548883", "step": 3705, "epoch": 3 }, { "type": "loss", "content": 0.1130293756723404, "timestamp": "2025-09-02 14:21:07.551196", "step": 3706, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:07.607793", "step": 3706, "epoch": 3 }, { "type": "loss", "content": 0.24910658597946167, "timestamp": "2025-09-02 14:21:07.610526", "step": 3707, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:07.664240", "step": 3707, "epoch": 3 }, { "type": "loss", "content": 0.10871189087629318, "timestamp": "2025-09-02 14:21:07.670400", "step": 3708, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:07.723716", "step": 3708, "epoch": 3 }, { "type": "loss", "content": 0.19738776981830597, "timestamp": "2025-09-02 14:21:07.726419", "step": 3709, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:07.780824", "step": 3709, "epoch": 3 }, { "type": "loss", "content": 0.14543481171131134, "timestamp": "2025-09-02 14:21:07.783151", "step": 3710, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:07.837846", "step": 3710, "epoch": 3 }, { "type": "loss", "content": 0.23947906494140625, "timestamp": "2025-09-02 14:21:07.840965", "step": 3711, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:07.895914", "step": 3711, "epoch": 3 }, { "type": "loss", "content": 0.08881819248199463, "timestamp": "2025-09-02 14:21:07.902205", "step": 3712, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:07.956153", "step": 3712, "epoch": 3 }, { "type": "loss", "content": 0.30187639594078064, "timestamp": "2025-09-02 14:21:07.958549", "step": 3713, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:08.013235", "step": 3713, "epoch": 3 }, { "type": "loss", "content": 0.24059610068798065, "timestamp": "2025-09-02 14:21:08.015771", "step": 3714, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:08.069485", "step": 3714, "epoch": 3 }, { "type": "loss", "content": 0.14325883984565735, "timestamp": "2025-09-02 14:21:08.072059", "step": 3715, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:08.126118", "step": 3715, "epoch": 3 }, { "type": "loss", "content": 0.2983851134777069, "timestamp": "2025-09-02 14:21:08.131858", "step": 3716, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:08.184310", "step": 3716, "epoch": 3 }, { "type": "loss", "content": 0.12818245589733124, "timestamp": "2025-09-02 14:21:08.186468", "step": 3717, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:08.239611", "step": 3717, "epoch": 3 }, { "type": "loss", "content": 0.15758869051933289, "timestamp": "2025-09-02 14:21:08.241969", "step": 3718, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:08.295724", "step": 3718, "epoch": 3 }, { "type": "loss", "content": 0.15895844995975494, "timestamp": "2025-09-02 14:21:08.297633", "step": 3719, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:08.350650", "step": 3719, "epoch": 3 }, { "type": "loss", "content": 0.10404472053050995, "timestamp": "2025-09-02 14:21:08.357129", "step": 3720, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:21:10.239745", "step": 3720, "epoch": 3 }, { "type": "pplx", "content": 8654.205325523391, "timestamp": "2025-09-02 14:21:10.241795", "step": 3720, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3720", "timestamp": "2025-09-02 14:21:10.598679", "step": 3720, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:10.657225", "step": 3720, "epoch": 3 }, { "type": "loss", "content": 0.12062975764274597, "timestamp": "2025-09-02 14:21:10.659614", "step": 3721, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:10.713850", "step": 3721, "epoch": 3 }, { "type": "loss", "content": 0.08681948482990265, "timestamp": "2025-09-02 14:21:10.716306", "step": 3722, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:10.770555", "step": 3722, "epoch": 3 }, { "type": "loss", "content": 0.1894225776195526, "timestamp": "2025-09-02 14:21:10.772613", "step": 3723, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:10.826081", "step": 3723, "epoch": 3 }, { "type": "loss", "content": 0.1927589327096939, "timestamp": "2025-09-02 14:21:10.832125", "step": 3724, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:10.884595", "step": 3724, "epoch": 3 }, { "type": "loss", "content": 0.12709173560142517, "timestamp": "2025-09-02 14:21:10.886850", "step": 3725, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:10.941123", "step": 3725, "epoch": 3 }, { "type": "loss", "content": 0.1924726963043213, "timestamp": "2025-09-02 14:21:10.943523", "step": 3726, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:10.996330", "step": 3726, "epoch": 3 }, { "type": "loss", "content": 0.1939297765493393, "timestamp": "2025-09-02 14:21:10.998701", "step": 3727, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:11.051477", "step": 3727, "epoch": 3 }, { "type": "loss", "content": 0.12211831659078598, "timestamp": "2025-09-02 14:21:11.057582", "step": 3728, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:11.110297", "step": 3728, "epoch": 3 }, { "type": "loss", "content": 0.16973745822906494, "timestamp": "2025-09-02 14:21:11.112385", "step": 3729, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:11.165084", "step": 3729, "epoch": 3 }, { "type": "loss", "content": 0.17475302517414093, "timestamp": "2025-09-02 14:21:11.167165", "step": 3730, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:11.220124", "step": 3730, "epoch": 3 }, { "type": "loss", "content": 0.08475841581821442, "timestamp": "2025-09-02 14:21:11.222722", "step": 3731, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:11.275099", "step": 3731, "epoch": 3 }, { "type": "loss", "content": 0.09456567466259003, "timestamp": "2025-09-02 14:21:11.280944", "step": 3732, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:11.333616", "step": 3732, "epoch": 3 }, { "type": "loss", "content": 0.03622080758213997, "timestamp": "2025-09-02 14:21:11.335938", "step": 3733, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:11.388929", "step": 3733, "epoch": 3 }, { "type": "loss", "content": 0.03237542510032654, "timestamp": "2025-09-02 14:21:11.391296", "step": 3734, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:11.444084", "step": 3734, "epoch": 3 }, { "type": "loss", "content": 0.25346624851226807, "timestamp": "2025-09-02 14:21:11.446492", "step": 3735, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:11.500384", "step": 3735, "epoch": 3 }, { "type": "loss", "content": 0.1867988258600235, "timestamp": "2025-09-02 14:21:11.506177", "step": 3736, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:11.558831", "step": 3736, "epoch": 3 }, { "type": "loss", "content": 0.08663009852170944, "timestamp": "2025-09-02 14:21:11.561075", "step": 3737, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:11.613922", "step": 3737, "epoch": 3 }, { "type": "loss", "content": 0.10797131061553955, "timestamp": "2025-09-02 14:21:11.616015", "step": 3738, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:11.668889", "step": 3738, "epoch": 3 }, { "type": "loss", "content": 0.14938785135746002, "timestamp": "2025-09-02 14:21:11.671829", "step": 3739, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:11.724969", "step": 3739, "epoch": 3 }, { "type": "loss", "content": 0.1578887403011322, "timestamp": "2025-09-02 14:21:11.730987", "step": 3740, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:21:13.631480", "step": 3740, "epoch": 3 }, { "type": "pplx", "content": 8746.479742056816, "timestamp": "2025-09-02 14:21:13.633741", "step": 3740, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:13.686225", "step": 3740, "epoch": 3 }, { "type": "loss", "content": 0.1489241123199463, "timestamp": "2025-09-02 14:21:13.688686", "step": 3741, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:13.742197", "step": 3741, "epoch": 3 }, { "type": "loss", "content": 0.10440410673618317, "timestamp": "2025-09-02 14:21:13.745434", "step": 3742, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:13.799796", "step": 3742, "epoch": 3 }, { "type": "loss", "content": 0.2732594609260559, "timestamp": "2025-09-02 14:21:13.802185", "step": 3743, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:13.856092", "step": 3743, "epoch": 3 }, { "type": "loss", "content": 0.05595889315009117, "timestamp": "2025-09-02 14:21:13.862399", "step": 3744, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:13.914445", "step": 3744, "epoch": 3 }, { "type": "loss", "content": 0.195451557636261, "timestamp": "2025-09-02 14:21:13.916697", "step": 3745, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:13.969304", "step": 3745, "epoch": 3 }, { "type": "loss", "content": 0.17354324460029602, "timestamp": "2025-09-02 14:21:13.971867", "step": 3746, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:14.028234", "step": 3746, "epoch": 3 }, { "type": "loss", "content": 0.16191868484020233, "timestamp": "2025-09-02 14:21:14.030977", "step": 3747, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:14.084185", "step": 3747, "epoch": 3 }, { "type": "loss", "content": 0.1382880061864853, "timestamp": "2025-09-02 14:21:14.089883", "step": 3748, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:14.145593", "step": 3748, "epoch": 3 }, { "type": "loss", "content": 0.20604048669338226, "timestamp": "2025-09-02 14:21:14.147857", "step": 3749, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:14.200873", "step": 3749, "epoch": 3 }, { "type": "loss", "content": 0.15444105863571167, "timestamp": "2025-09-02 14:21:14.203207", "step": 3750, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:14.260612", "step": 3750, "epoch": 3 }, { "type": "loss", "content": 0.15431207418441772, "timestamp": "2025-09-02 14:21:14.262966", "step": 3751, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:14.316301", "step": 3751, "epoch": 3 }, { "type": "loss", "content": 0.044626858085393906, "timestamp": "2025-09-02 14:21:14.322365", "step": 3752, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:14.374386", "step": 3752, "epoch": 3 }, { "type": "loss", "content": 0.09009623527526855, "timestamp": "2025-09-02 14:21:14.376967", "step": 3753, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:14.430040", "step": 3753, "epoch": 3 }, { "type": "loss", "content": 0.1538008749485016, "timestamp": "2025-09-02 14:21:14.432488", "step": 3754, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:14.488179", "step": 3754, "epoch": 3 }, { "type": "loss", "content": 0.16029655933380127, "timestamp": "2025-09-02 14:21:14.493385", "step": 3755, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:14.549141", "step": 3755, "epoch": 3 }, { "type": "loss", "content": 0.15758486092090607, "timestamp": "2025-09-02 14:21:14.555445", "step": 3756, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:14.608926", "step": 3756, "epoch": 3 }, { "type": "loss", "content": 0.16120706498622894, "timestamp": "2025-09-02 14:21:14.613474", "step": 3757, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:14.666594", "step": 3757, "epoch": 3 }, { "type": "loss", "content": 0.08993211388587952, "timestamp": "2025-09-02 14:21:14.670858", "step": 3758, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:14.724744", "step": 3758, "epoch": 3 }, { "type": "loss", "content": 0.12886913120746613, "timestamp": "2025-09-02 14:21:14.727216", "step": 3759, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:14.780011", "step": 3759, "epoch": 3 }, { "type": "loss", "content": 0.2807186543941498, "timestamp": "2025-09-02 14:21:14.786050", "step": 3760, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:21:16.711790", "step": 3760, "epoch": 3 }, { "type": "pplx", "content": 8799.791259901476, "timestamp": "2025-09-02 14:21:16.713575", "step": 3760, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3760", "timestamp": "2025-09-02 14:21:17.072250", "step": 3760, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:17.129827", "step": 3760, "epoch": 3 }, { "type": "loss", "content": 0.3745931088924408, "timestamp": "2025-09-02 14:21:17.131882", "step": 3761, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:17.185107", "step": 3761, "epoch": 3 }, { "type": "loss", "content": 0.10993775725364685, "timestamp": "2025-09-02 14:21:17.187342", "step": 3762, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:17.241241", "step": 3762, "epoch": 3 }, { "type": "loss", "content": 0.12845298647880554, "timestamp": "2025-09-02 14:21:17.243603", "step": 3763, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:17.296043", "step": 3763, "epoch": 3 }, { "type": "loss", "content": 0.22109824419021606, "timestamp": "2025-09-02 14:21:17.302099", "step": 3764, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:17.354114", "step": 3764, "epoch": 3 }, { "type": "loss", "content": 0.14241762459278107, "timestamp": "2025-09-02 14:21:17.356118", "step": 3765, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:17.409399", "step": 3765, "epoch": 3 }, { "type": "loss", "content": 0.08328738808631897, "timestamp": "2025-09-02 14:21:17.411863", "step": 3766, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:17.464792", "step": 3766, "epoch": 3 }, { "type": "loss", "content": 0.15988898277282715, "timestamp": "2025-09-02 14:21:17.467350", "step": 3767, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:17.520301", "step": 3767, "epoch": 3 }, { "type": "loss", "content": 0.35437947511672974, "timestamp": "2025-09-02 14:21:17.526076", "step": 3768, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:17.579136", "step": 3768, "epoch": 3 }, { "type": "loss", "content": 0.16190952062606812, "timestamp": "2025-09-02 14:21:17.581208", "step": 3769, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:17.634179", "step": 3769, "epoch": 3 }, { "type": "loss", "content": 0.10397324711084366, "timestamp": "2025-09-02 14:21:17.636370", "step": 3770, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:17.688989", "step": 3770, "epoch": 3 }, { "type": "loss", "content": 0.14572039246559143, "timestamp": "2025-09-02 14:21:17.691325", "step": 3771, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:17.744711", "step": 3771, "epoch": 3 }, { "type": "loss", "content": 0.12954586744308472, "timestamp": "2025-09-02 14:21:17.750644", "step": 3772, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:17.802764", "step": 3772, "epoch": 3 }, { "type": "loss", "content": 0.14624328911304474, "timestamp": "2025-09-02 14:21:17.805628", "step": 3773, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:17.858046", "step": 3773, "epoch": 3 }, { "type": "loss", "content": 0.1472795605659485, "timestamp": "2025-09-02 14:21:17.860406", "step": 3774, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:17.912712", "step": 3774, "epoch": 3 }, { "type": "loss", "content": 0.205438032746315, "timestamp": "2025-09-02 14:21:17.914723", "step": 3775, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:17.967341", "step": 3775, "epoch": 3 }, { "type": "loss", "content": 0.31924372911453247, "timestamp": "2025-09-02 14:21:17.972895", "step": 3776, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:18.024808", "step": 3776, "epoch": 3 }, { "type": "loss", "content": 0.0879032239317894, "timestamp": "2025-09-02 14:21:18.027071", "step": 3777, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:18.080606", "step": 3777, "epoch": 3 }, { "type": "loss", "content": 0.0928167924284935, "timestamp": "2025-09-02 14:21:18.083034", "step": 3778, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:18.135844", "step": 3778, "epoch": 3 }, { "type": "loss", "content": 0.15693019330501556, "timestamp": "2025-09-02 14:21:18.138137", "step": 3779, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:18.191668", "step": 3779, "epoch": 3 }, { "type": "loss", "content": 0.1866833120584488, "timestamp": "2025-09-02 14:21:18.197469", "step": 3780, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:21:20.071624", "step": 3780, "epoch": 3 }, { "type": "pplx", "content": 8421.717354799803, "timestamp": "2025-09-02 14:21:20.073812", "step": 3780, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:20.127041", "step": 3780, "epoch": 3 }, { "type": "loss", "content": 0.3369508981704712, "timestamp": "2025-09-02 14:21:20.129443", "step": 3781, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:20.183383", "step": 3781, "epoch": 3 }, { "type": "loss", "content": 0.2385195940732956, "timestamp": "2025-09-02 14:21:20.186000", "step": 3782, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:20.240040", "step": 3782, "epoch": 3 }, { "type": "loss", "content": 0.21962815523147583, "timestamp": "2025-09-02 14:21:20.242127", "step": 3783, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:20.295711", "step": 3783, "epoch": 3 }, { "type": "loss", "content": 0.07102625072002411, "timestamp": "2025-09-02 14:21:20.302317", "step": 3784, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:20.354812", "step": 3784, "epoch": 3 }, { "type": "loss", "content": 0.1265784054994583, "timestamp": "2025-09-02 14:21:20.357053", "step": 3785, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:20.409968", "step": 3785, "epoch": 3 }, { "type": "loss", "content": 0.10605842620134354, "timestamp": "2025-09-02 14:21:20.412938", "step": 3786, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:20.465909", "step": 3786, "epoch": 3 }, { "type": "loss", "content": 0.09101241827011108, "timestamp": "2025-09-02 14:21:20.468362", "step": 3787, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:20.521383", "step": 3787, "epoch": 3 }, { "type": "loss", "content": 0.15990766882896423, "timestamp": "2025-09-02 14:21:20.527358", "step": 3788, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:20.581401", "step": 3788, "epoch": 3 }, { "type": "loss", "content": 0.097979336977005, "timestamp": "2025-09-02 14:21:20.583956", "step": 3789, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:20.636688", "step": 3789, "epoch": 3 }, { "type": "loss", "content": 0.21833744645118713, "timestamp": "2025-09-02 14:21:20.638969", "step": 3790, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:20.691686", "step": 3790, "epoch": 3 }, { "type": "loss", "content": 0.10212662816047668, "timestamp": "2025-09-02 14:21:20.694033", "step": 3791, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:20.748242", "step": 3791, "epoch": 3 }, { "type": "loss", "content": 0.1725444346666336, "timestamp": "2025-09-02 14:21:20.754381", "step": 3792, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:20.806694", "step": 3792, "epoch": 3 }, { "type": "loss", "content": 0.160175159573555, "timestamp": "2025-09-02 14:21:20.809092", "step": 3793, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:20.861539", "step": 3793, "epoch": 3 }, { "type": "loss", "content": 0.1672595739364624, "timestamp": "2025-09-02 14:21:20.864274", "step": 3794, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:20.920759", "step": 3794, "epoch": 3 }, { "type": "loss", "content": 0.16126006841659546, "timestamp": "2025-09-02 14:21:20.923185", "step": 3795, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:20.982032", "step": 3795, "epoch": 3 }, { "type": "loss", "content": 0.16562600433826447, "timestamp": "2025-09-02 14:21:20.988063", "step": 3796, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:21.042331", "step": 3796, "epoch": 3 }, { "type": "loss", "content": 0.11972685158252716, "timestamp": "2025-09-02 14:21:21.044508", "step": 3797, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:21.097291", "step": 3797, "epoch": 3 }, { "type": "loss", "content": 0.052742768079042435, "timestamp": "2025-09-02 14:21:21.099375", "step": 3798, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:21.152337", "step": 3798, "epoch": 3 }, { "type": "loss", "content": 0.11112796515226364, "timestamp": "2025-09-02 14:21:21.154890", "step": 3799, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:21.207579", "step": 3799, "epoch": 3 }, { "type": "loss", "content": 0.33502283692359924, "timestamp": "2025-09-02 14:21:21.212992", "step": 3800, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:21:23.090112", "step": 3800, "epoch": 3 }, { "type": "pplx", "content": 8172.90826030583, "timestamp": "2025-09-02 14:21:23.092538", "step": 3800, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3800", "timestamp": "2025-09-02 14:21:23.463912", "step": 3800, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:23.519972", "step": 3800, "epoch": 3 }, { "type": "loss", "content": 0.13203787803649902, "timestamp": "2025-09-02 14:21:23.522377", "step": 3801, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:23.576311", "step": 3801, "epoch": 3 }, { "type": "loss", "content": 0.1623270958662033, "timestamp": "2025-09-02 14:21:23.578479", "step": 3802, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:23.631512", "step": 3802, "epoch": 3 }, { "type": "loss", "content": 0.11439917981624603, "timestamp": "2025-09-02 14:21:23.633775", "step": 3803, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:23.686814", "step": 3803, "epoch": 3 }, { "type": "loss", "content": 0.10638491064310074, "timestamp": "2025-09-02 14:21:23.693039", "step": 3804, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:23.745352", "step": 3804, "epoch": 3 }, { "type": "loss", "content": 0.28129252791404724, "timestamp": "2025-09-02 14:21:23.747548", "step": 3805, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:23.800661", "step": 3805, "epoch": 3 }, { "type": "loss", "content": 0.12955987453460693, "timestamp": "2025-09-02 14:21:23.803258", "step": 3806, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:23.856275", "step": 3806, "epoch": 3 }, { "type": "loss", "content": 0.17236921191215515, "timestamp": "2025-09-02 14:21:23.858396", "step": 3807, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:23.911766", "step": 3807, "epoch": 3 }, { "type": "loss", "content": 0.18822801113128662, "timestamp": "2025-09-02 14:21:23.917908", "step": 3808, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:23.970354", "step": 3808, "epoch": 3 }, { "type": "loss", "content": 0.14580923318862915, "timestamp": "2025-09-02 14:21:23.972832", "step": 3809, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:24.026239", "step": 3809, "epoch": 3 }, { "type": "loss", "content": 0.1157793328166008, "timestamp": "2025-09-02 14:21:24.028593", "step": 3810, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:24.082045", "step": 3810, "epoch": 3 }, { "type": "loss", "content": 0.17365866899490356, "timestamp": "2025-09-02 14:21:24.084472", "step": 3811, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:24.137400", "step": 3811, "epoch": 3 }, { "type": "loss", "content": 0.067583367228508, "timestamp": "2025-09-02 14:21:24.143149", "step": 3812, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:24.195468", "step": 3812, "epoch": 3 }, { "type": "loss", "content": 0.07229101657867432, "timestamp": "2025-09-02 14:21:24.198446", "step": 3813, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:24.251045", "step": 3813, "epoch": 3 }, { "type": "loss", "content": 0.2815386652946472, "timestamp": "2025-09-02 14:21:24.253266", "step": 3814, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:24.306280", "step": 3814, "epoch": 3 }, { "type": "loss", "content": 0.18720191717147827, "timestamp": "2025-09-02 14:21:24.308538", "step": 3815, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:24.360761", "step": 3815, "epoch": 3 }, { "type": "loss", "content": 0.15254339575767517, "timestamp": "2025-09-02 14:21:24.366731", "step": 3816, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:24.419002", "step": 3816, "epoch": 3 }, { "type": "loss", "content": 0.14160726964473724, "timestamp": "2025-09-02 14:21:24.421315", "step": 3817, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:24.473802", "step": 3817, "epoch": 3 }, { "type": "loss", "content": 0.16982300579547882, "timestamp": "2025-09-02 14:21:24.476156", "step": 3818, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:24.530104", "step": 3818, "epoch": 3 }, { "type": "loss", "content": 0.28235185146331787, "timestamp": "2025-09-02 14:21:24.532387", "step": 3819, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:24.584953", "step": 3819, "epoch": 3 }, { "type": "loss", "content": 0.2323727160692215, "timestamp": "2025-09-02 14:21:24.590569", "step": 3820, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:21:26.517831", "step": 3820, "epoch": 3 }, { "type": "pplx", "content": 8059.643993534743, "timestamp": "2025-09-02 14:21:26.520009", "step": 3820, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:26.572198", "step": 3820, "epoch": 3 }, { "type": "loss", "content": 0.1592533141374588, "timestamp": "2025-09-02 14:21:26.574407", "step": 3821, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:26.627286", "step": 3821, "epoch": 3 }, { "type": "loss", "content": 0.11756738275289536, "timestamp": "2025-09-02 14:21:26.629433", "step": 3822, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:26.683105", "step": 3822, "epoch": 3 }, { "type": "loss", "content": 0.1388217806816101, "timestamp": "2025-09-02 14:21:26.685354", "step": 3823, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:26.738237", "step": 3823, "epoch": 3 }, { "type": "loss", "content": 0.21314001083374023, "timestamp": "2025-09-02 14:21:26.744372", "step": 3824, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:26.796581", "step": 3824, "epoch": 3 }, { "type": "loss", "content": 0.09537056088447571, "timestamp": "2025-09-02 14:21:26.798930", "step": 3825, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:26.852011", "step": 3825, "epoch": 3 }, { "type": "loss", "content": 0.16938985884189606, "timestamp": "2025-09-02 14:21:26.854373", "step": 3826, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:26.907361", "step": 3826, "epoch": 3 }, { "type": "loss", "content": 0.06443826109170914, "timestamp": "2025-09-02 14:21:26.910270", "step": 3827, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:26.964411", "step": 3827, "epoch": 3 }, { "type": "loss", "content": 0.22213725745677948, "timestamp": "2025-09-02 14:21:26.970039", "step": 3828, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:27.022364", "step": 3828, "epoch": 3 }, { "type": "loss", "content": 0.1835777312517166, "timestamp": "2025-09-02 14:21:27.025121", "step": 3829, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:27.077984", "step": 3829, "epoch": 3 }, { "type": "loss", "content": 0.111510269343853, "timestamp": "2025-09-02 14:21:27.080906", "step": 3830, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:27.133922", "step": 3830, "epoch": 3 }, { "type": "loss", "content": 0.16694819927215576, "timestamp": "2025-09-02 14:21:27.136114", "step": 3831, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:27.188651", "step": 3831, "epoch": 3 }, { "type": "loss", "content": 0.0829571858048439, "timestamp": "2025-09-02 14:21:27.194399", "step": 3832, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:27.247490", "step": 3832, "epoch": 3 }, { "type": "loss", "content": 0.12238135933876038, "timestamp": "2025-09-02 14:21:27.249858", "step": 3833, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:27.302420", "step": 3833, "epoch": 3 }, { "type": "loss", "content": 0.13774190843105316, "timestamp": "2025-09-02 14:21:27.304663", "step": 3834, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:27.358377", "step": 3834, "epoch": 3 }, { "type": "loss", "content": 0.07566042989492416, "timestamp": "2025-09-02 14:21:27.360721", "step": 3835, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:27.413574", "step": 3835, "epoch": 3 }, { "type": "loss", "content": 0.09290306270122528, "timestamp": "2025-09-02 14:21:27.419285", "step": 3836, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:27.471895", "step": 3836, "epoch": 3 }, { "type": "loss", "content": 0.11594920605421066, "timestamp": "2025-09-02 14:21:27.474605", "step": 3837, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:27.527830", "step": 3837, "epoch": 3 }, { "type": "loss", "content": 0.10716418921947479, "timestamp": "2025-09-02 14:21:27.532400", "step": 3838, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:27.587309", "step": 3838, "epoch": 3 }, { "type": "loss", "content": 0.22407253086566925, "timestamp": "2025-09-02 14:21:27.589647", "step": 3839, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:27.642178", "step": 3839, "epoch": 3 }, { "type": "loss", "content": 0.15009889006614685, "timestamp": "2025-09-02 14:21:27.648328", "step": 3840, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:21:29.528070", "step": 3840, "epoch": 3 }, { "type": "pplx", "content": 7742.626222973535, "timestamp": "2025-09-02 14:21:29.530433", "step": 3840, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3840", "timestamp": "2025-09-02 14:21:29.911893", "step": 3840, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:29.969042", "step": 3840, "epoch": 3 }, { "type": "loss", "content": 0.18011435866355896, "timestamp": "2025-09-02 14:21:29.971379", "step": 3841, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:30.025041", "step": 3841, "epoch": 3 }, { "type": "loss", "content": 0.19560381770133972, "timestamp": "2025-09-02 14:21:30.027405", "step": 3842, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:30.080303", "step": 3842, "epoch": 3 }, { "type": "loss", "content": 0.06726943701505661, "timestamp": "2025-09-02 14:21:30.082761", "step": 3843, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:30.136102", "step": 3843, "epoch": 3 }, { "type": "loss", "content": 0.14549490809440613, "timestamp": "2025-09-02 14:21:30.142281", "step": 3844, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:30.194905", "step": 3844, "epoch": 3 }, { "type": "loss", "content": 0.18603573739528656, "timestamp": "2025-09-02 14:21:30.197724", "step": 3845, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:30.251026", "step": 3845, "epoch": 3 }, { "type": "loss", "content": 0.16088120639324188, "timestamp": "2025-09-02 14:21:30.253295", "step": 3846, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:30.307795", "step": 3846, "epoch": 3 }, { "type": "loss", "content": 0.19374307990074158, "timestamp": "2025-09-02 14:21:30.310223", "step": 3847, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:30.363851", "step": 3847, "epoch": 3 }, { "type": "loss", "content": 0.1342175006866455, "timestamp": "2025-09-02 14:21:30.369835", "step": 3848, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:30.422228", "step": 3848, "epoch": 3 }, { "type": "loss", "content": 0.11649564653635025, "timestamp": "2025-09-02 14:21:30.424638", "step": 3849, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:30.477597", "step": 3849, "epoch": 3 }, { "type": "loss", "content": 0.16206708550453186, "timestamp": "2025-09-02 14:21:30.479956", "step": 3850, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:30.532824", "step": 3850, "epoch": 3 }, { "type": "loss", "content": 0.1385522335767746, "timestamp": "2025-09-02 14:21:30.535055", "step": 3851, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:30.589416", "step": 3851, "epoch": 3 }, { "type": "loss", "content": 0.09341373294591904, "timestamp": "2025-09-02 14:21:30.595673", "step": 3852, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:30.649080", "step": 3852, "epoch": 3 }, { "type": "loss", "content": 0.16133441030979156, "timestamp": "2025-09-02 14:21:30.651228", "step": 3853, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:30.704155", "step": 3853, "epoch": 3 }, { "type": "loss", "content": 0.2501571774482727, "timestamp": "2025-09-02 14:21:30.706472", "step": 3854, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:30.761057", "step": 3854, "epoch": 3 }, { "type": "loss", "content": 0.19694547355175018, "timestamp": "2025-09-02 14:21:30.763326", "step": 3855, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:30.817084", "step": 3855, "epoch": 3 }, { "type": "loss", "content": 0.27300530672073364, "timestamp": "2025-09-02 14:21:30.823336", "step": 3856, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:30.876151", "step": 3856, "epoch": 3 }, { "type": "loss", "content": 0.21703201532363892, "timestamp": "2025-09-02 14:21:30.878741", "step": 3857, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:30.931714", "step": 3857, "epoch": 3 }, { "type": "loss", "content": 0.13791173696517944, "timestamp": "2025-09-02 14:21:30.934350", "step": 3858, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:30.988076", "step": 3858, "epoch": 3 }, { "type": "loss", "content": 0.18849653005599976, "timestamp": "2025-09-02 14:21:30.990400", "step": 3859, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:31.043700", "step": 3859, "epoch": 3 }, { "type": "loss", "content": 0.16978701949119568, "timestamp": "2025-09-02 14:21:31.049847", "step": 3860, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:21:33.011149", "step": 3860, "epoch": 3 }, { "type": "pplx", "content": 7579.420168346346, "timestamp": "2025-09-02 14:21:33.015069", "step": 3860, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:33.068283", "step": 3860, "epoch": 3 }, { "type": "loss", "content": 0.23943738639354706, "timestamp": "2025-09-02 14:21:33.071476", "step": 3861, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:33.125014", "step": 3861, "epoch": 3 }, { "type": "loss", "content": 0.15053339302539825, "timestamp": "2025-09-02 14:21:33.127586", "step": 3862, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:33.180675", "step": 3862, "epoch": 3 }, { "type": "loss", "content": 0.08064207434654236, "timestamp": "2025-09-02 14:21:33.183150", "step": 3863, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:33.236322", "step": 3863, "epoch": 3 }, { "type": "loss", "content": 0.08223865926265717, "timestamp": "2025-09-02 14:21:33.242620", "step": 3864, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:33.294930", "step": 3864, "epoch": 3 }, { "type": "loss", "content": 0.19900067150592804, "timestamp": "2025-09-02 14:21:33.297180", "step": 3865, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:33.350148", "step": 3865, "epoch": 3 }, { "type": "loss", "content": 0.1039421409368515, "timestamp": "2025-09-02 14:21:33.352976", "step": 3866, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:33.406075", "step": 3866, "epoch": 3 }, { "type": "loss", "content": 0.0903979167342186, "timestamp": "2025-09-02 14:21:33.408283", "step": 3867, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:33.461681", "step": 3867, "epoch": 3 }, { "type": "loss", "content": 0.15289445221424103, "timestamp": "2025-09-02 14:21:33.468470", "step": 3868, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:33.524792", "step": 3868, "epoch": 3 }, { "type": "loss", "content": 0.130441352725029, "timestamp": "2025-09-02 14:21:33.527847", "step": 3869, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:33.580769", "step": 3869, "epoch": 3 }, { "type": "loss", "content": 0.12424968928098679, "timestamp": "2025-09-02 14:21:33.583241", "step": 3870, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:33.635992", "step": 3870, "epoch": 3 }, { "type": "loss", "content": 0.08035693317651749, "timestamp": "2025-09-02 14:21:33.638314", "step": 3871, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:33.691044", "step": 3871, "epoch": 3 }, { "type": "loss", "content": 0.15148459374904633, "timestamp": "2025-09-02 14:21:33.697155", "step": 3872, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:33.751027", "step": 3872, "epoch": 3 }, { "type": "loss", "content": 0.28570178151130676, "timestamp": "2025-09-02 14:21:33.753789", "step": 3873, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:33.807277", "step": 3873, "epoch": 3 }, { "type": "loss", "content": 0.1342833936214447, "timestamp": "2025-09-02 14:21:33.810286", "step": 3874, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:33.863237", "step": 3874, "epoch": 3 }, { "type": "loss", "content": 0.21334195137023926, "timestamp": "2025-09-02 14:21:33.866001", "step": 3875, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:33.919340", "step": 3875, "epoch": 3 }, { "type": "loss", "content": 0.16234147548675537, "timestamp": "2025-09-02 14:21:33.925331", "step": 3876, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:33.978274", "step": 3876, "epoch": 3 }, { "type": "loss", "content": 0.20673879981040955, "timestamp": "2025-09-02 14:21:33.981834", "step": 3877, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:34.035506", "step": 3877, "epoch": 3 }, { "type": "loss", "content": 0.07943962514400482, "timestamp": "2025-09-02 14:21:34.037853", "step": 3878, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:34.091038", "step": 3878, "epoch": 3 }, { "type": "loss", "content": 0.23913614451885223, "timestamp": "2025-09-02 14:21:34.093379", "step": 3879, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:34.146505", "step": 3879, "epoch": 3 }, { "type": "loss", "content": 0.12242354452610016, "timestamp": "2025-09-02 14:21:34.152472", "step": 3880, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:21:36.033873", "step": 3880, "epoch": 3 }, { "type": "pplx", "content": 7395.81694599993, "timestamp": "2025-09-02 14:21:36.036011", "step": 3880, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 3880", "timestamp": "2025-09-02 14:21:36.394349", "step": 3880, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:36.451611", "step": 3880, "epoch": 3 }, { "type": "loss", "content": 0.10765019804239273, "timestamp": "2025-09-02 14:21:36.453999", "step": 3881, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 2880017550912.0 }, "timestamp": "2025-09-02 14:21:36.508327", "step": 3881, "epoch": 3 }, { "type": "loss", "content": 0.10210403800010681, "timestamp": "2025-09-02 14:21:36.510681", "step": 3882, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:36.568976", "step": 3882, "epoch": 3 }, { "type": "loss", "content": 0.15322604775428772, "timestamp": "2025-09-02 14:21:36.571194", "step": 3883, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:36.624518", "step": 3883, "epoch": 3 }, { "type": "loss", "content": 0.18226973712444305, "timestamp": "2025-09-02 14:21:36.631107", "step": 3884, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:36.685043", "step": 3884, "epoch": 3 }, { "type": "loss", "content": 0.12960737943649292, "timestamp": "2025-09-02 14:21:36.687450", "step": 3885, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:36.741565", "step": 3885, "epoch": 3 }, { "type": "loss", "content": 0.05072679743170738, "timestamp": "2025-09-02 14:21:36.744153", "step": 3886, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:36.798792", "step": 3886, "epoch": 3 }, { "type": "loss", "content": 0.18406963348388672, "timestamp": "2025-09-02 14:21:36.801319", "step": 3887, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:36.855346", "step": 3887, "epoch": 3 }, { "type": "loss", "content": 0.178181454539299, "timestamp": "2025-09-02 14:21:36.861813", "step": 3888, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:36.916897", "step": 3888, "epoch": 3 }, { "type": "loss", "content": 0.1698291003704071, "timestamp": "2025-09-02 14:21:36.919350", "step": 3889, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:36.973698", "step": 3889, "epoch": 3 }, { "type": "loss", "content": 0.14309123158454895, "timestamp": "2025-09-02 14:21:36.976530", "step": 3890, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:37.029752", "step": 3890, "epoch": 3 }, { "type": "loss", "content": 0.11882942914962769, "timestamp": "2025-09-02 14:21:37.032355", "step": 3891, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:37.085819", "step": 3891, "epoch": 3 }, { "type": "loss", "content": 0.13527408242225647, "timestamp": "2025-09-02 14:21:37.091848", "step": 3892, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:37.144413", "step": 3892, "epoch": 3 }, { "type": "loss", "content": 0.1447136253118515, "timestamp": "2025-09-02 14:21:37.147052", "step": 3893, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:37.200329", "step": 3893, "epoch": 3 }, { "type": "loss", "content": 0.0968262329697609, "timestamp": "2025-09-02 14:21:37.202724", "step": 3894, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:37.256923", "step": 3894, "epoch": 3 }, { "type": "loss", "content": 0.13071610033512115, "timestamp": "2025-09-02 14:21:37.259592", "step": 3895, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:37.313344", "step": 3895, "epoch": 3 }, { "type": "loss", "content": 0.31222033500671387, "timestamp": "2025-09-02 14:21:37.319470", "step": 3896, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:37.373986", "step": 3896, "epoch": 3 }, { "type": "loss", "content": 0.08790139853954315, "timestamp": "2025-09-02 14:21:37.376890", "step": 3897, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:37.430838", "step": 3897, "epoch": 3 }, { "type": "loss", "content": 0.08164341002702713, "timestamp": "2025-09-02 14:21:37.433186", "step": 3898, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 2560015608320.0 }, "timestamp": "2025-09-02 14:21:37.487046", "step": 3898, "epoch": 3 }, { "type": "loss", "content": 0.17312836647033691, "timestamp": "2025-09-02 14:21:37.489232", "step": 3899, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 2240013665728.0 }, "timestamp": "2025-09-02 14:21:37.544254", "step": 3899, "epoch": 3 }, { "type": "loss", "content": 0.09472893178462982, "timestamp": "2025-09-02 14:21:37.550464", "step": 3900, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:21:39.445333", "step": 3900, "epoch": 3 }, { "type": "pplx", "content": 7282.694216341476, "timestamp": "2025-09-02 14:21:39.447574", "step": 3900, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1596914505344 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2235680280448 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1916297392896 }, { "type": "perplexity", "in_batch_dim": [ 2, 96 ], "batch_size": 8, "flops": 1916297392896 } ], "timestamp": "2025-09-02 14:21:41.322373", "step": 3900, "epoch": 3 }, { "type": "pplx", "content": 7282.694216341476, "timestamp": "2025-09-02 14:21:41.324570", "step": 3900, "epoch": 3 }, { "type": "best_pplx", "content": 6936.326412348843, "timestamp": "2025-09-02 14:21:41.326186", "step": 3900, "epoch": 3 }, { "type": "best_step", "content": 2940, "timestamp": "2025-09-02 14:21:41.327682", "step": 3900, "epoch": 3 }, { "type": "total_pplx_flops", "content": 31522133944471680, "timestamp": "2025-09-02 14:21:41.329044", "step": 3900, "epoch": 3 }, { "type": "total_train_flops", "content": 1.00080610181424e+16, "timestamp": "2025-09-02 14:21:41.331242", "step": 3900, "epoch": 3 } ], "best_evals": { "pplx": { "score": 6936.326412348843, "step": 2940 }, "rouge1": { "precision": 0.42037362637362635, "recall": 0.4026062271062271, "fmeasure": 0.38750327450327454 } } }